Coverage Report

Created: 2025-04-11 06:14

/src/pcre2/src/pcre2_compile.c
Line
Count
Source (jump to first uncovered line)
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#include "pcre2_compile.h"
43
44
45
46
932k
#define NLBLOCK cb             /* Block containing newline information */
47
#define PSSTART start_pattern  /* Field containing processed string start */
48
251k
#define PSEND   end_pattern    /* Field containing processed string end */
49
50
/* In rare error cases debugging might require calling pcre2_printint(). */
51
52
#if 0
53
#ifdef EBCDIC
54
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
55
#else
56
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
57
#endif
58
#define CHAR_OUTPUT(c)      (c)
59
#define CHAR_OUTPUT_HEX(c)  (c)
60
#define CHAR_INPUT(c)       (c)
61
#define CHAR_INPUT_HEX(c)   (c)
62
#include "pcre2_printint_inc.h"
63
#undef PRINTABLE
64
#undef CHAR_OUTPUT
65
#undef CHAR_OUTPUT_HEX
66
#undef CHAR_INPUT
67
#define DEBUG_CALL_PRINTINT
68
#endif
69
70
/* Other debugging code can be enabled by these defines. */
71
72
/* #define DEBUG_SHOW_CAPTURES */
73
/* #define DEBUG_SHOW_PARSED */
74
75
/* There are a few things that vary with different code unit sizes. Handle them
76
by defining macros in order to minimize #if usage. */
77
78
#if PCRE2_CODE_UNIT_WIDTH == 8
/* 8-bit code units: XDIGIT indexes xdigitab directly with the code unit. */
79
#define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
80
25.3k
#define XDIGIT(c)                xdigitab[c]
81
82
#else  /* Either 16-bit or 32-bit */
/* Wider code units: xdigitab is only indexed when MAX_255(c) holds (presumably
c <= 255; MAX_255 is defined elsewhere). Otherwise XDIGIT yields 0xff, the
"not a hex digit" value. */
83
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
84
85
#if PCRE2_CODE_UNIT_WIDTH == 16
86
#define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
87
88
#else  /* 32-bit */
89
#define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
90
#endif  /* PCRE2_CODE_UNIT_WIDTH == 16 */
91
#endif  /* PCRE2_CODE_UNIT_WIDTH == 8 */
92
93
/* Function definitions to allow mutual recursion */
94
95
static int
96
  compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
97
    uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
98
    open_capitem *, compile_block *, PCRE2_SIZE *);
99
100
static int
101
  get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
102
    compile_block *);
103
104
static BOOL
105
  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
106
    compile_block *);
107
108
static int
109
  check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
110
    compile_block *, int *);
111
112
113
/*************************************************
114
*      Code parameters and static tables         *
115
*************************************************/
116
117
316k
#define MAX_GROUP_NUMBER   65535u
118
4.68M
#define MAX_REPEAT_COUNT   65535u
119
4.02M
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
120
121
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
122
different ways in the different pattern scans. The parsing and group-
123
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
124
aligned for this. Having defined the size in code units, we set up
125
C16_WORK_SIZE as the number of elements in the 16-bit vector.
126
127
During the first compiling phase, when determining how much memory is required,
128
the regex is partly compiled into this space, but the compiled parts are
129
discarded as soon as they can be, so that hopefully there will never be an
130
overrun. The code does, however, check for an overrun, which can occur for
131
pathological patterns. The size of the workspace depends on LINK_SIZE because
132
the length of compiled items varies with this.
133
134
In the real compile phase, this workspace is not currently used. */
135
136
64.8k
#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
137
138
#define C16_WORK_SIZE \
139
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
140
141
/* A uint32_t vector is used for caching information about the size of
142
capturing groups, to improve performance. A default is created on the stack of
143
this size. */
144
145
5.35k
#define GROUPINFO_DEFAULT_SIZE 256
146
147
/* The overrun tests check for a slightly smaller size so that they detect the
148
overrun before it actually does run off the end of the data block. */
149
150
14.7M
#define WORK_SIZE_SAFETY_MARGIN (100)
151
152
/* This value determines the size of the initial vector that is used for
153
remembering named groups during the pre-compile. It is allocated on the stack,
154
but if it is too small, it is expanded, in a similar way to the workspace. The
155
value is the number of slots in the list. */
156
157
129k
#define NAMED_GROUP_LIST_SIZE  20
158
159
/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
160
of uint32_t. For short patterns this lives on the stack, with this size. Heap
161
memory is used for longer patterns. */
162
163
61.8k
#define PARSED_PATTERN_DEFAULT_SIZE 1024
164
165
/* Maximum length value to check against when making sure that the variable
166
that holds the compiled pattern length does not overflow. We make it a bit less
167
than INT_MAX to allow for adding in group terminating code units, so that we
168
don't have to check them every time. */
169
170
14.3M
#define OFLOW_MAX (INT_MAX - 20)
171
172
/* Table of extra lengths for each of the meta codes. Must be kept in step with
173
the definitions above. For some items these values are a basic length to which
174
a variable amount has to be added. */
175
176
/* Read-only lookup table with one entry per META_ code, in the order shown by
the per-entry comments. It is initialised once here, so it is declared const,
matching the other static tables in this file (xdigitab, escapes, verbs). */
static const unsigned char meta_extra_lengths[] = {
177
  0,             /* META_END */
178
  0,             /* META_ALT */
179
  0,             /* META_ATOMIC */
180
  0,             /* META_BACKREF - more if group is >= 10 */
181
  1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
182
  1,             /* META_BIGVALUE */
183
  3,             /* META_CALLOUT_NUMBER */
184
  3+SIZEOFFSET,  /* META_CALLOUT_STRING */
185
  0,             /* META_CAPTURE */
186
  0,             /* META_CIRCUMFLEX */
187
  0,             /* META_CLASS */
188
  0,             /* META_CLASS_EMPTY */
189
  0,             /* META_CLASS_EMPTY_NOT */
190
  0,             /* META_CLASS_END */
191
  0,             /* META_CLASS_NOT */
192
  0,             /* META_COND_ASSERT */
193
  SIZEOFFSET,    /* META_COND_DEFINE */
194
  1+SIZEOFFSET,  /* META_COND_NAME */
195
  1+SIZEOFFSET,  /* META_COND_NUMBER */
196
  1+SIZEOFFSET,  /* META_COND_RNAME */
197
  1+SIZEOFFSET,  /* META_COND_RNUMBER */
198
  3,             /* META_COND_VERSION */
199
  SIZEOFFSET,    /* META_OFFSET */
200
  0,             /* META_SCS */
201
  1,             /* META_CAPTURE_NAME */
202
  1,             /* META_CAPTURE_NUMBER */
203
  0,             /* META_DOLLAR */
204
  0,             /* META_DOT */
205
  0,             /* META_ESCAPE - one more for ESC_P and ESC_p */
206
  0,             /* META_KET */
207
  0,             /* META_NOCAPTURE */
208
  2,             /* META_OPTIONS */
209
  1,             /* META_POSIX */
210
  1,             /* META_POSIX_NEG */
211
  0,             /* META_RANGE_ESCAPED */
212
  0,             /* META_RANGE_LITERAL */
213
  SIZEOFFSET,    /* META_RECURSE */
214
  1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
215
  0,             /* META_SCRIPT_RUN */
216
  0,             /* META_LOOKAHEAD */
217
  0,             /* META_LOOKAHEADNOT */
218
  SIZEOFFSET,    /* META_LOOKBEHIND */
219
  SIZEOFFSET,    /* META_LOOKBEHINDNOT */
220
  0,             /* META_LOOKAHEAD_NA */
221
  SIZEOFFSET,    /* META_LOOKBEHIND_NA */
222
  1,             /* META_MARK - plus the string length */
223
  0,             /* META_ACCEPT */
224
  0,             /* META_FAIL */
225
  0,             /* META_COMMIT */
226
  1,             /* META_COMMIT_ARG - plus the string length */
227
  0,             /* META_PRUNE */
228
  1,             /* META_PRUNE_ARG - plus the string length */
229
  0,             /* META_SKIP */
230
  1,             /* META_SKIP_ARG - plus the string length */
231
  0,             /* META_THEN */
232
  1,             /* META_THEN_ARG - plus the string length */
233
  0,             /* META_ASTERISK */
234
  0,             /* META_ASTERISK_PLUS */
235
  0,             /* META_ASTERISK_QUERY */
236
  0,             /* META_PLUS */
237
  0,             /* META_PLUS_PLUS */
238
  0,             /* META_PLUS_QUERY */
239
  0,             /* META_QUERY */
240
  0,             /* META_QUERY_PLUS */
241
  0,             /* META_QUERY_QUERY */
242
  2,             /* META_MINMAX */
243
  2,             /* META_MINMAX_PLUS */
244
  2,             /* META_MINMAX_QUERY */
245
  0,             /* META_ECLASS_AND */
246
  0,             /* META_ECLASS_OR */
247
  0,             /* META_ECLASS_SUB */
248
  0,             /* META_ECLASS_XOR */
249
  0              /* META_ECLASS_NOT */
250
};
251
252
/* Types for skipping parts of a parsed pattern. */
253
254
enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
255
256
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
257
variables, which are concerned with first and required code units. A value
258
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
259
matching xxcu variable is set, and the low valued bits are relevant. */
260
261
28.3M
#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
262
7.16M
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
263
565k
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
264
5.15M
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
265
266
/* These flags are used in the groupinfo vector. */
267
268
229k
#define GI_SET_FIXED_LENGTH    0x80000000u
269
14.8k
#define GI_NOT_FIXED_LENGTH    0x40000000u
270
1.51k
#define GI_FIXED_LENGTH_MASK   0x0000ffffu
271
272
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
273
and is fast (a good compiler can turn it into a subtraction and unsigned
274
comparison). */
275
276
4.01M
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
277
278
/* Table to identify hex digits. The tables in chartables are dependent on the
279
locale, and may mark arbitrary characters as digits. We want to recognize only
280
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
281
costs 256 bytes, but it is a lot faster than doing character value tests (at
282
least in some simple cases I timed), and in some applications one wants PCRE2
283
to compile efficiently as well as match efficiently. The value in the table is
284
the binary hex digit value, or 0xff for non-hex digits. */
285
286
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
287
UTF-8 mode. */
288
289
#ifndef EBCDIC
290
/* 256 entries, indexed by character value: 0x00-0x0f for '0'-'9', 'A'-'F' and
'a'-'f', and 0xff for every other value. */
static const uint8_t xdigitab[] =
291
  {
292
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
293
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
294
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
295
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
296
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
297
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
298
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
299
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
300
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
301
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
302
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
303
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
304
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
305
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
306
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
307
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
308
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
309
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
310
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
311
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
312
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
313
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
314
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
315
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
316
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
317
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
318
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
319
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
320
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
321
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
322
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
323
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
324
325
#else
326
327
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
328
329
static const uint8_t xdigitab[] =
330
  {
331
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
332
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
333
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
334
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
335
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
336
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
337
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
338
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
339
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
340
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
341
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
342
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
343
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
344
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
345
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
346
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
347
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
348
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
349
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
350
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
351
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
352
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
353
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
354
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
355
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
356
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
357
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
358
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
359
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
360
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
361
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
362
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
363
#endif  /* EBCDIC */
364
365
366
/* Table for handling alphanumeric escaped characters. Positive returns are
367
simple data values; negative values are for special things like \d and so on.
368
Zero means further processing is needed (for things like \x), or the escape is
369
invalid. */
370
371
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
372
in UTF-8 mode. It runs from '0' to 'z'. */
373
374
#ifndef EBCDIC
375
2.28M
#define ESCAPES_FIRST       CHAR_0
376
1.12M
#define ESCAPES_LAST        CHAR_z
377
516
#define UPPER_CASE(c)       ((c)-32)  /* Lower to upper case offset (ASCII); argument parenthesized so expression arguments bind as a whole */
378
379
static const short int escapes[] = {
380
    /* 0 */ 0,                       /* 1 */ 0,
381
    /* 2 */ 0,                       /* 3 */ 0,
382
    /* 4 */ 0,                       /* 5 */ 0,
383
    /* 6 */ 0,                       /* 7 */ 0,
384
    /* 8 */ 0,                       /* 9 */ 0,
385
    /* : */ ESCAPES_FIRST+0x0a,      /* ; */ ESCAPES_FIRST+0x0b,
386
    /* < */ ESCAPES_FIRST+0x0c,      /* = */ ESCAPES_FIRST+0x0d,
387
    /* > */ ESCAPES_FIRST+0x0e,      /* ? */ ESCAPES_FIRST+0x0f,
388
    /* @ */ ESCAPES_FIRST+0x10,      /* A */ -ESC_A,
389
    /* B */ -ESC_B,                  /* C */ -ESC_C,
390
    /* D */ -ESC_D,                  /* E */ -ESC_E,
391
    /* F */ 0,                       /* G */ -ESC_G,
392
    /* H */ -ESC_H,                  /* I */ 0,
393
    /* J */ 0,                       /* K */ -ESC_K,
394
    /* L */ 0,                       /* M */ 0,
395
    /* N */ -ESC_N,                  /* O */ 0,
396
    /* P */ -ESC_P,                  /* Q */ -ESC_Q,
397
    /* R */ -ESC_R,                  /* S */ -ESC_S,
398
    /* T */ 0,                       /* U */ 0,
399
    /* V */ -ESC_V,                  /* W */ -ESC_W,
400
    /* X */ -ESC_X,                  /* Y */ 0,
401
    /* Z */ -ESC_Z,                  /* [ */ ESCAPES_FIRST+0x2b,
402
    /* \ */ ESCAPES_FIRST+0x2c,      /* ] */ ESCAPES_FIRST+0x2d,
403
    /* ^ */ ESCAPES_FIRST+0x2e,      /* _ */ ESCAPES_FIRST+0x2f,
404
    /* ` */ ESCAPES_FIRST+0x30,      /* a */ CHAR_BEL,
405
    /* b */ -ESC_b,                  /* c */ 0,
406
    /* d */ -ESC_d,                  /* e */ CHAR_ESC,
407
    /* f */ CHAR_FF,                 /* g */ 0,
408
    /* h */ -ESC_h,                  /* i */ 0,
409
    /* j */ 0,                       /* k */ -ESC_k,
410
    /* l */ 0,                       /* m */ 0,
411
    /* n */ CHAR_LF,                 /* o */ 0,
412
    /* p */ -ESC_p,                  /* q */ 0,
413
    /* r */ CHAR_CR,                 /* s */ -ESC_s,
414
    /* t */ CHAR_HT,                 /* u */ 0,
415
    /* v */ -ESC_v,                  /* w */ -ESC_w,
416
    /* x */ 0,                       /* y */ 0,
417
    /* z */ -ESC_z
418
};
419
420
#else
421
422
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
423
It runs from 'a' to '9'. Our EBCDIC support can be provided via the compiler,
424
which can interpret character literals like 'a' or '[' in an EBCDIC codepage;
425
in this case, there is wide variance between codepages on the interpretation of
426
characters between the letters ('[' and '{' and so on are placed in all sorts of
427
different positions in the table). Thankfully however, all EBCDIC codepages
428
place the letters and digits in the same location, so we hardcode that here.
429
Our EBCDIC support can also be provided via numeric literals instead of
430
character literals, so either way, 'CHAR_a' will be 0x81 when PCRE2 is compiled
431
in EBCDIC mode. */
432
433
#define ESCAPES_FIRST       CHAR_a
434
#define ESCAPES_LAST        CHAR_9
435
#define UPPER_CASE(c)       ((c)+64)  /* Lower to upper case offset (EBCDIC); argument parenthesized so expression arguments bind as a whole */
436
437
static const short int escapes[] = {
438
    /* 0x81 a */ CHAR_BEL,             /* 0x82 b */ -ESC_b,
439
    /* 0x83 c */ 0,                    /* 0x84 d */ -ESC_d,
440
    /* 0x85 e */ CHAR_ESC,             /* 0x86 f */ CHAR_FF,
441
    /* 0x87 g */ 0,                    /* 0x88 h */ -ESC_h,
442
    /* 0x89 i */ 0,                    /* 0x8a   */ ESCAPES_FIRST+0x09,
443
    /* 0x8b   */ ESCAPES_FIRST+0x0a,   /* 0x8c   */ ESCAPES_FIRST+0x0b,
444
    /* 0x8d   */ ESCAPES_FIRST+0x0c,   /* 0x8e   */ ESCAPES_FIRST+0x0d,
445
    /* 0x8f   */ ESCAPES_FIRST+0x0e,   /* 0x90   */ ESCAPES_FIRST+0x0f,
446
    /* 0x91 j */ 0,                    /* 0x92 k */ -ESC_k,
447
    /* 0x93 l */ 0,                    /* 0x94 m */ 0,
448
    /* 0x95 n */ CHAR_LF,              /* 0x96 o */ 0,
449
    /* 0x97 p */ -ESC_p,               /* 0x98 q */ 0,
450
    /* 0x99 r */ CHAR_CR,              /* 0x9a   */ ESCAPES_FIRST+0x19,
451
    /* 0x9b   */ ESCAPES_FIRST+0x1a,   /* 0x9c   */ ESCAPES_FIRST+0x1b,
452
    /* 0x9d   */ ESCAPES_FIRST+0x1c,   /* 0x9e   */ ESCAPES_FIRST+0x1d,
453
    /* 0x9f   */ ESCAPES_FIRST+0x1e,   /* 0xa0   */ ESCAPES_FIRST+0x1f,
454
    /* 0xa1   */ ESCAPES_FIRST+0x20,   /* 0xa2 s */ -ESC_s,
455
    /* 0xa3 t */ CHAR_HT,              /* 0xa4 u */ 0,
456
    /* 0xa5 v */ -ESC_v,               /* 0xa6 w */ -ESC_w,
457
    /* 0xa7 x */ 0,                    /* 0xa8 y */ 0,
458
    /* 0xa9 z */ -ESC_z,               /* 0xaa   */ ESCAPES_FIRST+0x29,
459
    /* 0xab   */ ESCAPES_FIRST+0x2a,   /* 0xac   */ ESCAPES_FIRST+0x2b,
460
    /* 0xad   */ ESCAPES_FIRST+0x2c,   /* 0xae   */ ESCAPES_FIRST+0x2d,
461
    /* 0xaf   */ ESCAPES_FIRST+0x2e,   /* 0xb0   */ ESCAPES_FIRST+0x2f,
462
    /* 0xb1   */ ESCAPES_FIRST+0x30,   /* 0xb2   */ ESCAPES_FIRST+0x31,
463
    /* 0xb3   */ ESCAPES_FIRST+0x32,   /* 0xb4   */ ESCAPES_FIRST+0x33,
464
    /* 0xb5   */ ESCAPES_FIRST+0x34,   /* 0xb6   */ ESCAPES_FIRST+0x35,
465
    /* 0xb7   */ ESCAPES_FIRST+0x36,   /* 0xb8   */ ESCAPES_FIRST+0x37,
466
    /* 0xb9   */ ESCAPES_FIRST+0x38,   /* 0xba   */ ESCAPES_FIRST+0x39,
467
    /* 0xbb   */ ESCAPES_FIRST+0x3a,   /* 0xbc   */ ESCAPES_FIRST+0x3b,
468
    /* 0xbd   */ ESCAPES_FIRST+0x3c,   /* 0xbe   */ ESCAPES_FIRST+0x3d,
469
    /* 0xbf   */ ESCAPES_FIRST+0x3e,   /* 0xc0   */ ESCAPES_FIRST+0x3f,
470
    /* 0xc1 A */ -ESC_A,               /* 0xc2 B */ -ESC_B,
471
    /* 0xc3 C */ -ESC_C,               /* 0xc4 D */ -ESC_D,
472
    /* 0xc5 E */ -ESC_E,               /* 0xc6 F */ 0,
473
    /* 0xc7 G */ -ESC_G,               /* 0xc8 H */ -ESC_H,
474
    /* 0xc9 I */ 0,                    /* 0xca   */ ESCAPES_FIRST+0x49,
475
    /* 0xcb   */ ESCAPES_FIRST+0x4a,   /* 0xcc   */ ESCAPES_FIRST+0x4b,
476
    /* 0xcd   */ ESCAPES_FIRST+0x4c,   /* 0xce   */ ESCAPES_FIRST+0x4d,
477
    /* 0xcf   */ ESCAPES_FIRST+0x4e,   /* 0xd0   */ ESCAPES_FIRST+0x4f,
478
    /* 0xd1 J */ 0,                    /* 0xd2 K */ -ESC_K,
479
    /* 0xd3 L */ 0,                    /* 0xd4 M */ 0,
480
    /* 0xd5 N */ -ESC_N,               /* 0xd6 O */ 0,
481
    /* 0xd7 P */ -ESC_P,               /* 0xd8 Q */ -ESC_Q,
482
    /* 0xd9 R */ -ESC_R,               /* 0xda   */ ESCAPES_FIRST+0x59,
483
    /* 0xdb   */ ESCAPES_FIRST+0x5a,   /* 0xdc   */ ESCAPES_FIRST+0x5b,
484
    /* 0xdd   */ ESCAPES_FIRST+0x5c,   /* 0xde   */ ESCAPES_FIRST+0x5d,
485
    /* 0xdf   */ ESCAPES_FIRST+0x5e,   /* 0xe0   */ ESCAPES_FIRST+0x5f,
486
    /* 0xe1   */ ESCAPES_FIRST+0x60,   /* 0xe2 S */ -ESC_S,
487
    /* 0xe3 T */ 0,                    /* 0xe4 U */ 0,
488
    /* 0xe5 V */ -ESC_V,               /* 0xe6 W */ -ESC_W,
489
    /* 0xe7 X */ -ESC_X,               /* 0xe8 Y */ 0,
490
    /* 0xe9 Z */ -ESC_Z,               /* 0xea   */ ESCAPES_FIRST+0x69,
491
    /* 0xeb   */ ESCAPES_FIRST+0x6a,   /* 0xec   */ ESCAPES_FIRST+0x6b,
492
    /* 0xed   */ ESCAPES_FIRST+0x6c,   /* 0xee   */ ESCAPES_FIRST+0x6d,
493
    /* 0xef   */ ESCAPES_FIRST+0x6e,   /* 0xf0 0 */ 0,
494
    /* 0xf1 1 */ 0,                    /* 0xf2 2 */ 0,
495
    /* 0xf3 3 */ 0,                    /* 0xf4 4 */ 0,
496
    /* 0xf5 5 */ 0,                    /* 0xf6 6 */ 0,
497
    /* 0xf7 7 */ 0,                    /* 0xf8 8 */ 0,
498
    /* 0xf9 9 */ 0,
499
};
500
501
/* We also need a table of characters that may follow \c in an EBCDIC
502
environment for characters 0-31. */
503
504
/* Read-only table of 32 characters: '@', 'A'-'Z', '[', '\', ']', '^', '_'.
Declared const, matching the other static lookup tables in this file. */
static const unsigned char ebcdic_escape_c[] = {
505
  CHAR_COMMERCIAL_AT, CHAR_A, CHAR_B, CHAR_C, CHAR_D, CHAR_E, CHAR_F, CHAR_G,
506
  CHAR_H, CHAR_I, CHAR_J, CHAR_K, CHAR_L, CHAR_M, CHAR_N, CHAR_O, CHAR_P,
507
  CHAR_Q, CHAR_R, CHAR_S, CHAR_T, CHAR_U, CHAR_V, CHAR_W, CHAR_X, CHAR_Y,
508
  CHAR_Z, CHAR_LEFT_SQUARE_BRACKET, CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
509
  CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE
510
};
511
512
#endif   /* EBCDIC */
513
514
515
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
516
searched linearly. Put all the names into a single string, in order to reduce
517
the number of relocations when a shared library is dynamically linked. The
518
string is built from string macros so that it works in UTF-8 mode on EBCDIC
519
platforms. */
520
521
/* Describes one (*VERB) name. The sign of has_arg follows the annotations in
the verbs[] table: > 0 means an argument is required, < 0 means an optional
argument that is converted to a pre-MARK, and 0 means an optional argument
that bumps the META code when present. */
typedef struct verbitem {
522
  unsigned int len;          /* Length of verb name */
523
  uint32_t meta;             /* Base META_ code */
524
  int has_arg;               /* Argument requirement */
525
} verbitem;
526
527
static const char verbnames[] =
528
  "\0"                       /* Empty name is a shorthand for MARK */
529
  STRING_MARK0
530
  STRING_ACCEPT0
531
  STRING_F0
532
  STRING_FAIL0
533
  STRING_COMMIT0
534
  STRING_PRUNE0
535
  STRING_SKIP0
536
  STRING_THEN;
537
538
/* One entry per name in verbnames, in the same order. The first entry has
length 0 for the empty name, and the remaining lengths match the names
suggested by the STRING_ macros (MARK, ACCEPT, F, FAIL, COMMIT, PRUNE, SKIP,
THEN). */
static const verbitem verbs[] = {
539
  { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
540
  { 4, META_MARK,   +1 },
541
  { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
542
  { 1, META_FAIL,   -1 },
543
  { 4, META_FAIL,   -1 },
544
  { 6, META_COMMIT,  0 },
545
  { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
546
  { 4, META_SKIP,    0 },
547
  { 4, META_THEN,    0 }
548
};
549
550
static const int verbcount = sizeof verbs / sizeof verbs[0];  /* Number of entries in verbs[] */
551
552
/* Verb opcodes, indexed by their META code offset from META_MARK. */
553
554
static const uint32_t verbops[] = {
555
  OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
556
  OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
557
558
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
559
560
/* Describes one alpha assertion name: its length and the META_ code it maps
to. */
typedef struct alasitem {
561
  unsigned int len;          /* Length of name */
562
  uint32_t meta;             /* Base META_ code */
563
} alasitem;
564
565
static const char alasnames[] =
566
  STRING_pla0
567
  STRING_plb0
568
  STRING_napla0
569
  STRING_naplb0
570
  STRING_nla0
571
  STRING_nlb0
572
  STRING_positive_lookahead0
573
  STRING_positive_lookbehind0
574
  STRING_non_atomic_positive_lookahead0
575
  STRING_non_atomic_positive_lookbehind0
576
  STRING_negative_lookahead0
577
  STRING_negative_lookbehind0
578
  STRING_scs0
579
  STRING_scan_substring0
580
  STRING_atomic0
581
  STRING_sr0
582
  STRING_asr0
583
  STRING_script_run0
584
  STRING_atomic_script_run;
585
586
/* One entry per name in alasnames, in the same order. Each length matches the
name suggested by the corresponding STRING_ macro (for example 3 for "pla" and
18 for "positive_lookahead"). Short and long spellings of the same assertion
share a META_ code. */
static const alasitem alasmeta[] = {
587
  {  3, META_LOOKAHEAD         },
588
  {  3, META_LOOKBEHIND        },
589
  {  5, META_LOOKAHEAD_NA      },
590
  {  5, META_LOOKBEHIND_NA     },
591
  {  3, META_LOOKAHEADNOT      },
592
  {  3, META_LOOKBEHINDNOT     },
593
  { 18, META_LOOKAHEAD         },
594
  { 19, META_LOOKBEHIND        },
595
  { 29, META_LOOKAHEAD_NA      },
596
  { 30, META_LOOKBEHIND_NA     },
597
  { 18, META_LOOKAHEADNOT      },
598
  { 19, META_LOOKBEHINDNOT     },
599
  {  3, META_SCS               },
600
  { 14, META_SCS               },
601
  {  6, META_ATOMIC            },
602
  {  2, META_SCRIPT_RUN        }, /* sr = script run */
603
  {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
604
  { 10, META_SCRIPT_RUN        }, /* script run */
605
  { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
606
};
607
608
static const int alascount = sizeof alasmeta / sizeof alasmeta[0];  /* Number of entries in alasmeta[] */
609
610
/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
611
612
/* Four read-only offsets relative to OP_STAR; the first is always zero. Going
by the opcode names, the order is plain, caseless, negated, negated caseless.
Declared const, matching the other static lookup tables in this file. */
static const uint32_t chartypeoffset[] = {
613
  OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
614
  OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
615
616
/* Tables of names of POSIX character classes and their lengths. The names are
617
now all in a single string, to reduce the number of relocations when a shared
618
library is dynamically loaded. The list of lengths is terminated by a zero
619
length entry. The first three must be alpha, lower, upper, as this is assumed
620
for handling case independence.
621
622
The indices for several classes are stored in pcre2_compile.h - these must
623
be kept in sync with posix_names, posix_name_lengths, posix_class_maps,
624
and posix_substitutes. */
625
626
static const char posix_names[] =
627
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
628
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
629
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
630
  STRING_word0  STRING_xdigit;
631
632
/* One length per name in posix_names, in the same order: twelve 5-character
names, then 4 (word) and 6 (xdigit), followed by the terminating zero. */
static const uint8_t posix_name_lengths[] = {
633
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
634
635
/* Table of class bit maps for each POSIX class. Each class is formed from a
636
base map, with an optional addition or removal of another map. Then, for some
637
classes, there is some additional tweaking: for [:blank:] the vertical space
638
characters are removed, and for [:alpha:] and [:alnum:] the underscore
639
character is removed. The triples in the table consist of the base map offset,
640
second map offset or -1 if no second map, and a non-negative value for map
641
addition or a negative value for map subtraction (if there are two maps). The
642
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
643
remove vertical space characters, 2 => remove underscore. */
644
645
const int PRIV(posix_class_maps)[] = {
646
  cbit_word,   cbit_digit, -2,            /* alpha */
647
  cbit_lower,  -1,          0,            /* lower */
648
  cbit_upper,  -1,          0,            /* upper */
649
  cbit_word,   -1,          2,            /* alnum - word without underscore */
650
  cbit_print,  cbit_cntrl,  0,            /* ascii */
651
  cbit_space,  -1,          1,            /* blank - a GNU extension */
652
  cbit_cntrl,  -1,          0,            /* cntrl */
653
  cbit_digit,  -1,          0,            /* digit */
654
  cbit_graph,  -1,          0,            /* graph */
655
  cbit_print,  -1,          0,            /* print */
656
  cbit_punct,  -1,          0,            /* punct */
657
  cbit_space,  -1,          0,            /* space */
658
  cbit_word,   -1,          0,            /* word - a Perl extension */
659
  cbit_xdigit, -1,          0             /* xdigit */
660
};
661
662
#ifdef SUPPORT_UNICODE
663
664
/* The POSIX class Unicode property substitutes that are used in UCP mode must
665
be in the order of the POSIX class names, defined above. */
666
667
static int posix_substitutes[] = {
668
  PT_GC, ucp_L,     /* alpha */
669
  PT_PC, ucp_Ll,    /* lower */
670
  PT_PC, ucp_Lu,    /* upper */
671
  PT_ALNUM, 0,      /* alnum */
672
  -1, 0,            /* ascii, treat as non-UCP */
673
  -1, 1,            /* blank, treat as \h */
674
  PT_PC, ucp_Cc,    /* cntrl */
675
  PT_PC, ucp_Nd,    /* digit */
676
  PT_PXGRAPH, 0,    /* graph */
677
  PT_PXPRINT, 0,    /* print */
678
  PT_PXPUNCT, 0,    /* punct */
679
  PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
680
  PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
681
  PT_PXXDIGIT, 0    /* xdigit */  /* Perl has additional hex digits */
682
};
683
#endif  /* SUPPORT_UNICODE */
684
685
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
686
are allowed. */
687
688
/* Option bits that remain valid when PCRE2_LITERAL is set. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED          | \
   PCRE2_AUTO_CALLOUT      | \
   PCRE2_CASELESS          | \
   PCRE2_ENDANCHORED       | \
   PCRE2_FIRSTLINE         | \
   PCRE2_LITERAL           | \
   PCRE2_MATCH_INVALID_UTF | \
   PCRE2_NO_START_OPTIMIZE | \
   PCRE2_NO_UTF_CHECK      | \
   PCRE2_USE_OFFSET_LIMIT  | \
   PCRE2_UTF)
692
693
/* The full option mask: the literal-mode subset plus the remaining option
bits listed here. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS | \
   PCRE2_ALLOW_EMPTY_CLASS        | \
   PCRE2_ALT_BSUX                 | \
   PCRE2_ALT_CIRCUMFLEX           | \
   PCRE2_ALT_VERBNAMES            | \
   PCRE2_DOLLAR_ENDONLY           | \
   PCRE2_DOTALL                   | \
   PCRE2_DUPNAMES                 | \
   PCRE2_EXTENDED                 | \
   PCRE2_EXTENDED_MORE            | \
   PCRE2_MATCH_UNSET_BACKREF      | \
   PCRE2_MULTILINE                | \
   PCRE2_NEVER_BACKSLASH_C        | \
   PCRE2_NEVER_UCP                | \
   PCRE2_NEVER_UTF                | \
   PCRE2_NO_AUTO_CAPTURE          | \
   PCRE2_NO_AUTO_POSSESS          | \
   PCRE2_NO_DOTSTAR_ANCHOR        | \
   PCRE2_UCP                      | \
   PCRE2_UNGREEDY                 | \
   PCRE2_ALT_EXTENDED_CLASS)
701
702
/* Extra-option bits making up the literal-mode subset. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE        | \
    PCRE2_EXTRA_MATCH_WORD        | \
    PCRE2_EXTRA_CASELESS_RESTRICT | \
    PCRE2_EXTRA_TURKISH_CASING)
705
706
/* The full extra-option mask: the literal-mode subset plus the remaining
extra-option bits listed here. */

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS | \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES  | \
    PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL    | \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF         | \
    PCRE2_EXTRA_ALT_BSUX                 | \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK     | \
    PCRE2_EXTRA_ASCII_BSD                | \
    PCRE2_EXTRA_ASCII_BSS                | \
    PCRE2_EXTRA_ASCII_BSW                | \
    PCRE2_EXTRA_ASCII_POSIX              | \
    PCRE2_EXTRA_ASCII_DIGIT              | \
    PCRE2_EXTRA_PYTHON_OCTAL             | \
    PCRE2_EXTRA_NO_BS0                   | \
    PCRE2_EXTRA_NEVER_CALLOUT)
714
715
/* This is a table of start-of-pattern options such as (*UTF) and settings such
716
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
717
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
718
generic and always supported. */
719
720
/* Kinds of start-of-pattern item. The kind says how the "value" field of the
corresponding table entry is to be interpreted. */

enum {
  PSO_OPT,     /* Value is an option bit */
  PSO_XOPT,    /* Value is an xoption bit */
  PSO_FLG,     /* Value is a flag bit */
  PSO_NL,      /* Value is a newline type */
  PSO_BSR,     /* Value is a \R type */
  PSO_LIMH,    /* Read integer value for heap limit */
  PSO_LIMM,    /* Read integer value for match limit */
  PSO_LIMD,    /* Read integer value for depth limit */
  PSO_OPTMZ    /* Value is an optimization bit */
};
730
731
/* One entry in the table of start-of-pattern items. */

typedef struct pso {
  const char *name;     /* Text of the item */
  uint16_t    length;   /* Number of characters in name */
  uint16_t    type;     /* One of the PSO_xxx kinds */
  uint32_t    value;    /* Meaning depends on type */
} pso;
737
738
/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
739
740
static const pso pso_list[] = {
741
  { STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
742
  { STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
743
  { STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
744
  { STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
745
  { STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
746
  { STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },
747
  { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },
748
  { STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
749
  { STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },
750
  { STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT },
751
  { STRING_TURKISH_CASING_RIGHTPAR,    15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING },
752
  { STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
753
  { STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
754
  { STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
755
  { STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
756
  { STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
757
  { STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
758
  { STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
759
  { STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
760
  { STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
761
  { STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
762
  { STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
763
  { STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
764
};
765
766
/* This table is used when converting repeating opcodes into possessified
767
versions as a result of an explicit possessive quantifier such as ++. A zero
768
value means there is no possessified version - in those cases the item in
769
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
770
because all relevant opcodes are less than that. */
771
772
static const uint8_t opcode_possessify[] = {
773
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
774
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
775
776
  0,                       /* NOTI */
777
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
778
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
779
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
780
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
781
  0,                       /* EXACT */
782
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
783
784
  OP_POSSTARI, 0,          /* STARI, MINSTARI */
785
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
786
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
787
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
788
  0,                       /* EXACTI */
789
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
790
791
  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
792
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
793
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
794
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
795
  0,                       /* NOTEXACT */
796
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
797
798
  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
799
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
800
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
801
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
802
  0,                       /* NOTEXACTI */
803
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
804
805
  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
806
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
807
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
808
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
809
  0,                       /* TYPEEXACT */
810
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
811
812
  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
813
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
814
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
815
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
816
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
817
818
  0, 0, 0, 0,              /* CLASS, NCLASS, XCLASS, ECLASS */
819
  0, 0,                    /* REF, REFI */
820
  0, 0,                    /* DNREF, DNREFI */
821
  0, 0,                    /* RECURSE, CALLOUT */
822
};
823
824
/* Compile-time check that the table has the correct size. */
825
STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify);
826
827
828
#ifdef DEBUG_SHOW_PARSED
829
/*************************************************
830
*     Show the parsed pattern for debugging      *
831
*************************************************/
832
833
/* For debugging the pre-scan, this code, which outputs the parsed data vector,
834
can be enabled. */
835
836
/* Write the parsed pattern vector to stderr, one item per line. Each line
starts with the item's index in the vector and its raw value in hex. Output
stops after META_END, or at a META code that is not recognized.

Argument:  cb   compile data block (only parsed_pattern and small_ref_offset
                are read)
Returns:   nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int rmin, rmax;
  PCRE2_SIZE offset;
  uint32_t k;
  uint32_t arglen;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Anything below META_END is a literal; show it only if it is a printing
  ASCII character. */

  if (*pptr < META_END)
    {
    uint32_t lit = *pptr++;
    if (lit > 32 && lit < 128) fprintf(stderr, "%c", lit);
    fprintf(stderr, "\n");
    continue;
    }

  switch (META_CODE(*pptr++))
    {
    default:
    fputs("**** OOPS - unknown META value - giving up ****\n", stderr);
    return;

    case META_END:
    fputs("META_END\n", stderr);
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %d", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset comes from the compile block;
    otherwise it follows in the vector. */

    case META_BACKREF:
    if (meta_arg >= 10)
      GETOFFSET(offset, pptr);
    else
      offset = cb->small_ref_offset[meta_arg];
    fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg != ESC_P && meta_arg != ESC_p)
      {
      uint32_t ch;

      /* There's just one escape we might have here that isn't negated in the
      escapes table. */

      if (meta_arg == ESC_g) ch = CHAR_g;
      else
        {
        for (ch = ESCAPES_FIRST; ch <= ESCAPES_LAST; ch++)
          if (meta_arg == (uint32_t)(-escapes[ch - ESCAPES_FIRST])) break;
        }
      if (ch > ESCAPES_LAST) ch = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", ch);
      }
    else
      {
      /* The next item holds the type in its top 16 bits and the value in its
      bottom 16 bits. */

      uint32_t prop = *pptr++;
      uint32_t ptype = prop >> 16;
      uint32_t pvalue = prop & 0xffff;
      fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
        ptype, pvalue);
      }
    break;

    case META_MINMAX:
    rmin = *pptr++;
    rmax = *pptr++;
    if (rmax == REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,}", rmin);
    else
      fprintf(stderr, "META {%d,%d}", rmin, rmax);
    break;

    case META_MINMAX_QUERY:
    rmin = *pptr++;
    rmax = *pptr++;
    if (rmax == REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,}?", rmin);
    else
      fprintf(stderr, "META {%d,%d}?", rmin, rmax);
    break;

    case META_MINMAX_PLUS:
    rmin = *pptr++;
    rmax = *pptr++;
    if (rmax == REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,}+", rmin);
    else
      fprintf(stderr, "META {%d,%d}+", rmin, rmax);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fputs("META_CIRCUMFLEX", stderr); break;
    case META_COND_ASSERT: fputs("META_COND_ASSERT", stderr); break;
    case META_DOLLAR: fputs("META_DOLLAR", stderr); break;
    case META_DOT: fputs("META_DOT", stderr); break;
    case META_ASTERISK: fputs("META *", stderr); break;
    case META_ASTERISK_QUERY: fputs("META *?", stderr); break;
    case META_ASTERISK_PLUS: fputs("META *+", stderr); break;
    case META_PLUS: fputs("META +", stderr); break;
    case META_PLUS_QUERY: fputs("META +?", stderr); break;
    case META_PLUS_PLUS: fputs("META ++", stderr); break;
    case META_QUERY: fputs("META ?", stderr); break;
    case META_QUERY_QUERY: fputs("META ??", stderr); break;
    case META_QUERY_PLUS: fputs("META ?+", stderr); break;

    case META_ATOMIC: fputs("META (?>", stderr); break;
    case META_NOCAPTURE: fputs("META (?:", stderr); break;
    case META_LOOKAHEAD: fputs("META (?=", stderr); break;
    case META_LOOKAHEADNOT: fputs("META (?!", stderr); break;
    case META_LOOKAHEAD_NA: fputs("META (*napla:", stderr); break;
    case META_SCRIPT_RUN: fputs("META (*sr:", stderr); break;
    case META_KET: fputs("META )", stderr); break;
    case META_ALT: fprintf(stderr, "META | %d", meta_arg); break;

    case META_CLASS: fputs("META [", stderr); break;
    case META_CLASS_NOT: fputs("META [^", stderr); break;
    case META_CLASS_END: fputs("META ]", stderr); break;
    case META_CLASS_EMPTY: fputs("META []", stderr); break;
    case META_CLASS_EMPTY_NOT: fputs("META [^]", stderr); break;

    case META_RANGE_LITERAL: fputs("META - (literal)", stderr); break;
    case META_RANGE_ESCAPED: fputs("META - (escaped)", stderr); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %d", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %d", *pptr++); break;

    case META_ACCEPT: fputs("META (*ACCEPT)", stderr); break;
    case META_FAIL: fputs("META (*FAIL)", stderr); break;
    case META_COMMIT: fputs("META (*COMMIT)", stderr); break;
    case META_PRUNE: fputs("META (*PRUNE)", stderr); break;
    case META_SKIP: fputs("META (*SKIP)", stderr); break;
    case META_THEN: fputs("META (*THEN)", stderr); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* The lookbehinds are followed by two items, of which only the first is
    shown. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %d %d", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %d %d", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %d %d", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t next_offset = *pptr++;  /* Offset of next pattern item */
      uint32_t next_length = *pptr++;  /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zd next=%d/%d", offset, next_offset, next_length);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* The number is stored after the offset, so it is fetched by index before
    the offset is consumed, and then stepped over. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fputs("META (?(DEFINE) offset=", stderr);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* Three items: a zero/non-zero selector for "=" or ">=", then the two
    parts of the version number. */

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%d.", *pptr++);
    fprintf(stderr, "%d)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%d offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_OFFSET:
    fputs("META_OFFSET offset=", stderr);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_SCS:
    fputs("META (*scan_substring:", stderr);
    break;

    case META_CAPTURE_NAME:
    fprintf(stderr, "META_CAPTURE_NAME length=%d relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    case META_CAPTURE_NUMBER:
    fprintf(stderr, "META_CAPTURE_NUMBER %d relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    /* Verbs with an argument share the code that prints the argument. */

    case META_MARK:
    fputs("META (*MARK:", stderr);
    goto SHOWARG;

    case META_COMMIT_ARG:
    fputs("META (*COMMIT:", stderr);
    goto SHOWARG;

    case META_PRUNE_ARG:
    fputs("META (*PRUNE:", stderr);
    goto SHOWARG;

    case META_SKIP_ARG:
    fputs("META (*SKIP:", stderr);
    goto SHOWARG;

    case META_THEN_ARG:
    fputs("META (*THEN:", stderr);

    /* The argument is a length followed by that many characters; anything
    that is not printing ASCII is shown in \x{...} form. */

    SHOWARG:
    arglen = *pptr++;
    for (k = 0; k < arglen; k++)
      {
      uint32_t ch = *pptr++;
      if (ch <= 32 || ch >= 128) fprintf(stderr, "\\x{%x}", ch);
        else fprintf(stderr, "%c", ch);
      }
    fprintf(stderr, ") length=%u", arglen);
    break;

    case META_ECLASS_AND: fputs("META_ECLASS_AND", stderr); break;
    case META_ECLASS_OR: fputs("META_ECLASS_OR", stderr); break;
    case META_ECLASS_SUB: fputs("META_ECLASS_SUB", stderr); break;
    case META_ECLASS_XOR: fputs("META_ECLASS_XOR", stderr); break;
    case META_ECLASS_NOT: fputs("META_ECLASS_NOT", stderr); break;
    }

  fprintf(stderr, "\n");
  }
}
1120
#endif  /* DEBUG_SHOW_PARSED */
1121
1122
1123
1124
/*************************************************
1125
*               Copy compiled code               *
1126
*************************************************/
1127
1128
/* Compiled JIT code cannot be copied, so the new compiled block has no
1129
associated JIT data. */
1130
1131
/* Duplicate a compiled pattern block using the allocator stored in the
original. The copy has no JIT data, and its tables pointer is the same as the
original's.

Argument:  code   the compiled pattern to copy, may be NULL
Returns:   the new block, or NULL if code is NULL or allocation fails
*/

PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code *code)
{
pcre2_code *copy;

if (code == NULL) return NULL;

copy = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (copy == NULL) return NULL;

memcpy(copy, code, code->blocksize);
copy->executable_jit = NULL;

/* Tables flagged with PCRE2_DEREF_TABLES carry a reference count immediately
after the table data; the copy is one more user of those tables. */

if ((code->flags & PCRE2_DEREF_TABLES) != 0)
  {
  PCRE2_SIZE *shared_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
  *shared_count += 1;
  }

return copy;
}
1154
1155
1156
1157
/*************************************************
1158
*     Copy compiled code and character tables    *
1159
*************************************************/
1160
1161
/* Compiled JIT code cannot be copied, so the new compiled block has no
1162
associated JIT data. This version of code_copy also makes a separate copy of
1163
the character tables. */
1164
1165
/* Duplicate a compiled pattern block together with its character tables, using
the allocator stored in the original. The copy has no JIT data. Its tables are
a fresh allocation with a reference count of 1, and PCRE2_DEREF_TABLES is set
in the copy's flags.

Argument:  code   the compiled pattern to copy, may be NULL
Returns:   the new block, or NULL if code is NULL or either allocation fails
*/

PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code *code)
{
pcre2_code *copy;
uint8_t *tables_copy;

if (code == NULL) return NULL;

copy = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (copy == NULL) return NULL;

memcpy(copy, code, code->blocksize);
copy->executable_jit = NULL;

/* The tables block has room for a reference count after the table data. */

tables_copy = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
  code->memctl.memory_data);

if (tables_copy != NULL)
  {
  memcpy(tables_copy, code->tables, TABLES_LENGTH);
  *(PCRE2_SIZE *)(tables_copy + TABLES_LENGTH) = 1;

  copy->tables = tables_copy;
  copy->flags |= PCRE2_DEREF_TABLES;
  return copy;
  }

/* No memory for the tables: give back the code block as well. */

code->memctl.free((void *)copy, code->memctl.memory_data);
return NULL;
}
1193
1194
1195
1196
/*************************************************
1197
*               Free compiled code               *
1198
*************************************************/
1199
1200
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1201
pcre2_code_free(pcre2_code *code)
1202
64.8k
{
1203
64.8k
PCRE2_SIZE* ref_count;
1204
1205
64.8k
if (code != NULL)
1206
54.9k
  {
1207
54.9k
#ifdef SUPPORT_JIT
1208
54.9k
  if (code->executable_jit != NULL)
1209
50.4k
    PRIV(jit_free)(code->executable_jit, &code->memctl);
1210
54.9k
#endif
1211
1212
54.9k
  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1213
0
    {
1214
    /* Decoded tables belong to the codes after deserialization, and they must
1215
    be freed when there are no more references to them. The *ref_count should
1216
    always be > 0. */
1217
1218
0
    ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1219
0
    if (*ref_count > 0)
1220
0
      {
1221
0
      (*ref_count)--;
1222
0
      if (*ref_count == 0)
1223
0
        code->memctl.free((void *)code->tables, code->memctl.memory_data);
1224
0
      }
1225
0
    }
1226
1227
54.9k
  code->memctl.free(code, code->memctl.memory_data);
1228
54.9k
  }
1229
64.8k
}
1230
1231
1232
1233
/*************************************************
1234
*         Read a number, possibly signed         *
1235
*************************************************/
1236
1237
/* This function is used to read numbers in the pattern. The initial pointer
1238
must be at the sign or first digit of the number. When relative values
1239
(introduced by + or -) are allowed, they are relative group numbers, and the
1240
result must be greater than zero.
1241
1242
Arguments:
1243
  ptrptr      points to the character pointer variable
1244
  ptrend      points to the end of the input string
1245
  allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1246
  max_value   the largest number allowed;
1247
              you must not pass a value for max_value larger than
1248
              INT_MAX/10 - 1 because this function relies on max_value to
1249
              avoid integer overflow
1250
  max_error   the error to give for an over-large number
1251
  intptr      where to put the result
1252
  errorcodeptr where to put an error code
1253
1254
Returns:      TRUE  - a number was read
1255
              FALSE - errorcode == 0 => no number was found
1256
                      errorcode != 0 => an error occurred
1257
*/
1258
1259
static BOOL
1260
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1261
  uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1262
740k
{
1263
740k
int sign = 0;
1264
740k
uint32_t n = 0;
1265
740k
PCRE2_SPTR ptr = *ptrptr;
1266
740k
BOOL yield = FALSE;
1267
1268
740k
PCRE2_ASSERT(max_value <= INT_MAX/10 - 1);
1269
1270
740k
*errorcodeptr = 0;
1271
1272
740k
if (allow_sign >= 0 && ptr < ptrend)
1273
38.6k
  {
1274
38.6k
  if (*ptr == CHAR_PLUS)
1275
30.6k
    {
1276
30.6k
    sign = +1;
1277
30.6k
    max_value -= allow_sign;
1278
30.6k
    ptr++;
1279
30.6k
    }
1280
7.98k
  else if (*ptr == CHAR_MINUS)
1281
115
    {
1282
115
    sign = -1;
1283
115
    ptr++;
1284
115
    }
1285
38.6k
  }
1286
1287
740k
if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1288
1.26M
while (ptr < ptrend && IS_DIGIT(*ptr))
1289
717k
  {
1290
717k
  n = n * 10 + (*ptr++ - CHAR_0);
1291
717k
  if (n > max_value)
1292
2.05k
    {
1293
2.05k
    *errorcodeptr = max_error;
1294
21.0k
    while (ptr < ptrend && IS_DIGIT(*ptr)) ptr++;
1295
2.05k
    goto EXIT;
1296
2.05k
    }
1297
717k
  }
1298
1299
547k
if (allow_sign >= 0 && sign != 0)
1300
30.7k
  {
1301
30.7k
  if (n == 0)
1302
10
    {
1303
10
    *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1304
10
    goto EXIT;
1305
10
    }
1306
1307
30.7k
  if (sign > 0) n += allow_sign;
1308
108
  else if (n > (uint32_t)allow_sign)
1309
16
    {
1310
16
    *errorcodeptr = ERR15;  /* Non-existent subpattern */
1311
16
    goto EXIT;
1312
16
    }
1313
92
  else n = allow_sign + 1 - n;
1314
30.7k
  }
1315
1316
547k
yield = TRUE;
1317
1318
549k
EXIT:
1319
549k
*intptr = n;
1320
549k
*ptrptr = ptr;
1321
549k
return yield;
1322
547k
}
1323
1324
1325
1326
/*************************************************
1327
*         Read repeat counts                     *
1328
*************************************************/
1329
1330
/* Read an item of the form {n,m} and return the values when non-NULL pointers
1331
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1332
larger value is used for "unlimited". We have to use signed arguments for
1333
read_number() because it is capable of returning a signed value. As of Perl
1334
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1335
tabs after { and before } and between the numbers and the comma, so we do too.
1336
1337
Arguments:
1338
  ptrptr         points to pointer to character after '{'
1339
  ptrend         pointer to end of input
1340
  minp           if not NULL, pointer to uint32_t for min
1341
  maxp           if not NULL, pointer to uint32_t for max
1342
  errorcodeptr   points to error code variable
1343
1344
Returns:         FALSE if not a repeat quantifier, errorcode set zero
1345
                 FALSE on error, with errorcode set non-zero
1346
                 TRUE on success, with pointer updated to point after '}'
1347
*/
1348
1349
static BOOL
1350
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1351
  uint32_t *maxp, int *errorcodeptr)
1352
740k
{
1353
740k
PCRE2_SPTR p = *ptrptr;
1354
740k
PCRE2_SPTR pp;
1355
740k
BOOL yield = FALSE;
1356
740k
BOOL had_minimum = FALSE;
1357
740k
int32_t min = 0;
1358
740k
int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1359
1360
740k
*errorcodeptr = 0;
1361
746k
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1362
1363
/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1364
such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1365
error. */
1366
1367
740k
pp = p;
1368
740k
if (pp < ptrend && IS_DIGIT(*pp))
1369
395k
  {
1370
395k
  had_minimum = TRUE;
1371
567k
  while (++pp < ptrend && IS_DIGIT(*pp)) {}
1372
395k
  }
1373
1374
751k
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1375
740k
if (pp >= ptrend) return FALSE;
1376
1377
738k
if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1378
166k
  {
1379
166k
  if (!had_minimum) return FALSE;
1380
166k
  }
1381
572k
else
1382
572k
  {
1383
572k
  if (*pp++ != CHAR_COMMA) return FALSE;
1384
312k
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1385
309k
  if (pp >= ptrend) return FALSE;
1386
308k
  if (IS_DIGIT(*pp))
1387
220k
    {
1388
266k
    while (++pp < ptrend && IS_DIGIT(*pp)) {}
1389
220k
    }
1390
88.2k
  else if (!had_minimum) return FALSE;
1391
311k
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1392
296k
  if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1393
296k
  }
1394
1395
/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}
1396
or {n,m}. The only error that read_number() can return is for a number that is
1397
too big. If *errorcodeptr is returned as zero it means no number was found. */
1398
1399
/* Deal with {,m} or n too big. If we successfully read m there is no need to
1400
check m >= n because n defaults to zero. */
1401
1402
412k
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1403
139k
  {
1404
139k
  if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1405
138k
  p++;  /* Skip comma and subsequent spaces */
1406
140k
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1407
138k
  if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1408
31
    {
1409
31
    if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1410
31
    }
1411
138k
  }
1412
1413
/* Have read one number. Deal with {n} or {n,} or {n,m} */
1414
1415
273k
else
1416
273k
  {
1417
274k
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1418
273k
  if (*p == CHAR_RIGHT_CURLY_BRACKET)
1419
161k
    {
1420
161k
    max = min;
1421
161k
    }
1422
111k
  else   /* Handle {n,} or {n,m} */
1423
111k
    {
1424
111k
    p++;    /* Skip comma and subsequent spaces */
1425
112k
    while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1426
111k
    if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1427
46.2k
      {
1428
46.2k
      if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1429
46.2k
      }
1430
1431
111k
    if (max < min)
1432
17
      {
1433
17
      *errorcodeptr = ERR4;
1434
17
      goto EXIT;
1435
17
      }
1436
111k
    }
1437
273k
  }
1438
1439
/* Valid quantifier exists */
1440
1441
423k
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1442
412k
p++;
1443
412k
yield = TRUE;
1444
412k
if (minp != NULL) *minp = (uint32_t)min;
1445
412k
if (maxp != NULL) *maxp = (uint32_t)max;
1446
1447
/* Update the pattern pointer */
1448
1449
412k
EXIT:
1450
412k
*ptrptr = p;
1451
412k
return yield;
1452
412k
}
1453
1454
1455
1456
/*************************************************
1457
*            Handle escapes                      *
1458
*************************************************/
1459
1460
/* This function is called when a \ has been encountered. It either returns a
1461
positive value for a simple escape such as \d, or 0 for a data character, which
1462
is placed in chptr. A backreference to group n is returned as -(n+1). On
1463
entry, ptr is pointing at the character after \. On exit, it points after the
1464
final code unit of the escape sequence.
1465
1466
This function is also called from pcre2_substitute() to handle escape sequences
1467
in replacement strings. In this case, the cb argument is NULL, and in the case
1468
of escapes that have further processing, only sequences that define a data
1469
character are recognised. The options argument is the final value of the
1470
compiled pattern's options.
1471
1472
Arguments:
1473
  ptrptr         points to the input position pointer
1474
  ptrend         points to the end of the input
1475
  chptr          points to a returned data character
1476
  errorcodeptr   points to the errorcode variable (containing zero)
1477
  options        the current options bits
1478
  xoptions       the current extra options bits
1479
  bracount       the number of capturing parentheses encountered so far
1480
  isclass        TRUE if in a character class
1481
  cb             compile data block or NULL when called from pcre2_substitute()
1482
1483
Returns:         zero => a data character
1484
                 positive => a special escape sequence
1485
                 negative => a numerical back reference
1486
                 on error, errorcodeptr is set non-zero
1487
*/
1488
1489
int
1490
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1491
  int *errorcodeptr, uint32_t options, uint32_t xoptions, uint32_t bracount,
1492
  BOOL isclass, compile_block *cb)
1493
1.26M
{
1494
1.26M
BOOL utf = (options & PCRE2_UTF) != 0;
1495
1.26M
BOOL alt_bsux =
1496
1.26M
  ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1497
1.26M
PCRE2_SPTR ptr = *ptrptr;
1498
1.26M
uint32_t c, cc;
1499
1.26M
int escape = 0;
1500
1.26M
int i;              /* escapes[] table entry; reused further down as a counter */
1501
1502
/* If backslash is at the end of the string, it's an error. */
1503
1504
1.26M
if (ptr >= ptrend)
1505
35
  {
1506
35
  *errorcodeptr = ERR1;
1507
35
  return 0;
1508
35
  }
1509
1510
1.26M
GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1511
1.26M
*errorcodeptr = 0;              /* Be optimistic */
1512
1513
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1514
value test saves a memory lookup for code points outside the alphanumeric
1515
range. */
1516
1517
1.26M
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1518
1519
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1520
positive value is a literal value for something like \n. A negative value is
1521
the negation of one of the ESC_ macros that is passed back for handling by the
1522
calling function. Some extra checking is needed for \N because only \N{U+dddd}
1523
is supported. If the value is zero, further processing is handled below. */
1524
1525
1.02M
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1526
961k
  {
1527
961k
  if (i > 0)
1528
138k
    {
1529
138k
    c = (uint32_t)i;
1530
138k
    if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1531
0
      c = CHAR_LF;
1532
138k
    }
1533
822k
  else  /* Negative table entry */
1534
822k
    {
1535
822k
    escape = -i;                    /* Else return a special escape */
1536
822k
    if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1537
53.0k
      cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1538
1539
    /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1540
    Unicode code points, as well as plain \N for "not newline". PCRE does not
1541
    support \N{name}. However, it does support quantification such as \N{2,3},
1542
    so if \N{ is not followed by U+dddd we check for a quantifier. */
1543
1544
822k
    if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1545
1.34k
      {
1546
1.34k
      PCRE2_SPTR p = ptr + 1;
1547
1548
      /* Perl ignores spaces and tabs after { */
1549
1550
1.97k
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1551
1552
      /* \N{U+ can be handled by the \x{ code. However, this construction is
1553
      not valid in EBCDIC environments because it specifies a Unicode
1554
      character, not a codepoint in the local code. For example \N{U+0041}
1555
      must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1556
      casing semantics for the entire pattern, so allow it only in UTF (i.e.
1557
      Unicode) mode. */
1558
1559
1.34k
      if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1560
7
        {
1561
7
#ifndef EBCDIC
1562
7
        if (utf)
1563
3
          {
1564
3
          ptr = p + 2;
1565
3
          escape = 0;   /* Not a fancy escape after all */
1566
3
          goto COME_FROM_NU;
1567
3
          }
1568
4
#endif
1569
4
        *errorcodeptr = ERR93;
1570
4
        }
1571
1572
      /* Give an error in contexts where quantifiers are not allowed
1573
      (character classes; substitution strings). */
1574
1575
1.33k
      else if (isclass || cb == NULL)
1576
3
        {
1577
3
        *errorcodeptr = ERR37;
1578
3
        }
1579
1580
      /* Give an error if what follows is not a quantifier, but don't override
1581
      an error set by the quantifier reader (e.g. number overflow). */
1582
1583
1.33k
      else
1584
1.33k
        {
1585
1.33k
        if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1586
1.33k
             *errorcodeptr == 0)
1587
50
          *errorcodeptr = ERR37;
1588
1.33k
        }
1589
1.34k
      }
1590
822k
    }
1591
961k
  }
1592
1593
/* Escapes that need further processing, including those that are unknown, have
1594
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1595
\g, \o, \x, and digits get past the filter below (\u and \U can never appear as
1596
they are used for case forcing). */
1597
1598
59.9k
else
1599
59.9k
  {
1600
59.9k
  int s;
1601
59.9k
  PCRE2_SPTR oldptr;
1602
59.9k
  BOOL overflow;
1603
1604
  /* Filter calls from pcre2_substitute(). */
1605
1606
59.9k
  if (cb == NULL)
1607
0
    {
1608
0
    if (!(c >= CHAR_0 && c <= CHAR_9) && c != CHAR_c && c != CHAR_o &&
1609
0
        c != CHAR_x && c != CHAR_g)
1610
0
      {
1611
0
      *errorcodeptr = ERR3;
1612
0
      return 0;
1613
0
      }
1614
0
    alt_bsux = FALSE;   /* Do not modify \x handling */
1615
0
    }
1616
1617
59.9k
  switch (c)
1618
59.9k
    {
1619
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
1620
    error. */
1621
1622
7
    case CHAR_F:
1623
10
    case CHAR_l:
1624
13
    case CHAR_L:
1625
13
    *errorcodeptr = ERR37;
1626
13
    break;
1627
1628
    /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1629
    is set. Otherwise, \u must be followed by exactly four hex digits or, if
1630
    PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1631
    Otherwise it is a lowercase u letter. This gives some compatibility with
1632
    ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1633
    allowed. When \u{ is not followed by hex digits, a special return is given
1634
    because otherwise \u{ 12} (for example) would be treated as u{12}. */
1635
1636
5.82k
    case CHAR_u:
1637
5.82k
    if (!alt_bsux) *errorcodeptr = ERR37; else
1638
5.68k
      {
1639
5.68k
      uint32_t xc;
1640
1641
5.68k
      if (ptr >= ptrend) break;
1642
5.68k
      if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1643
5.68k
          (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1644
0
        {
1645
0
        PCRE2_SPTR hptr = ptr + 1;
1646
1647
0
        cc = 0;
1648
0
        while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1649
0
          {
1650
0
          if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1651
0
            {
1652
0
            *errorcodeptr = ERR77;
1653
0
            ptr = hptr;   /* Show where */
1654
0
            break;        /* *hptr != } will cause another break below */
1655
0
            }
1656
0
          cc = (cc << 4) | xc;
1657
0
          hptr++;
1658
0
          }
1659
1660
0
        if (hptr == ptr + 1 ||   /* No hex digits */
1661
0
            hptr >= ptrend ||    /* Hit end of input */
1662
0
            *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1663
0
          {
1664
0
          if (isclass) break; /* In a class, just treat as '\u' literal */
1665
0
          escape = ESC_ub;    /* Special return */
1666
0
          ptr++;              /* Skip { */
1667
0
          break;              /* Hex escape not recognized */
1668
0
          }
1669
1670
0
        c = cc;          /* Accept the code point */
1671
0
        ptr = hptr + 1;
1672
0
        }
1673
1674
5.68k
      else  /* Must be exactly 4 hex digits */
1675
5.68k
        {
1676
5.68k
        if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1677
5.66k
        if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1678
5.00k
        if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1679
4.12k
        cc = (cc << 4) | xc;
1680
4.12k
        if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1681
3.53k
        cc = (cc << 4) | xc;
1682
3.53k
        if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1683
3.12k
        c = (cc << 4) | xc;
1684
3.12k
        ptr += 4;
1685
3.12k
        }
1686
1687
3.12k
      if (utf)
1688
2.91k
        {
1689
2.91k
        if (c > 0x10ffffU) *errorcodeptr = ERR77;
1690
2.91k
        else
1691
2.91k
          if (c >= 0xd800 && c <= 0xdfff &&
1692
2.91k
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1693
1
                *errorcodeptr = ERR73;
1694
2.91k
        }
1695
204
      else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1696
3.12k
      }
1697
3.25k
    break;
1698
1699
    /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1700
    in which case it is an upper case letter. */
1701
1702
3.25k
    case CHAR_U:
1703
801
    if (!alt_bsux) *errorcodeptr = ERR37;
1704
801
    break;
1705
1706
    /* In a character class, \g is just a literal "g". Outside a character
1707
    class, \g must be followed by one of a number of specific things:
1708
1709
    (1) A number, either plain or braced. If positive, it is an absolute
1710
    backreference. If negative, it is a relative backreference. This is a Perl
1711
    5.10 feature.
1712
1713
    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1714
    is part of Perl's movement towards a unified syntax for back references. As
1715
    this is synonymous with \k{name}, we fudge it up by pretending it really
1716
    was \k{name}.
1717
1718
    (3) For Oniguruma compatibility we also support \g followed by a name or a
1719
    number either in angle brackets or in single quotes. However, these are
1720
    (possibly recursive) subroutine calls, _not_ backreferences. We return
1721
    the ESC_g code.
1722
1723
    Summary: Return a negative number for a numerical back reference (offset
1724
    by 1), ESC_k for a named back reference, and ESC_g for a named or
1725
    numbered subroutine call.
1726
1727
    The above describes the \g behaviour inside patterns. Inside replacement
1728
    strings (pcre2_substitute) we support only \g<nameornum> for Python
1729
    compatibility. Return ESC_g for the named case, and -(num+1) for the
1730
    numbered case.
1731
    */
1732
1733
12.5k
    case CHAR_g:
1734
12.5k
    if (isclass) break;
1735
1736
12.2k
    if (ptr >= ptrend)
1737
3
      {
1738
3
      *errorcodeptr = ERR57;
1739
3
      break;
1740
3
      }
1741
1742
12.2k
    if (cb == NULL)
1743
0
      {
1744
0
      PCRE2_SPTR p;
1745
      /* Substitution strings */
1746
0
      if (*ptr != CHAR_LESS_THAN_SIGN)
1747
0
        {
1748
0
        *errorcodeptr = ERR57;
1749
0
        break;
1750
0
        }
1751
1752
0
      p = ptr + 1;
1753
1754
0
      if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,
1755
0
          errorcodeptr))
1756
0
        {
1757
0
        if (*errorcodeptr == 0) escape = ESC_g;  /* No number found */
1758
0
        break;
1759
0
        }
1760
1761
0
      if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)
1762
0
        {
1763
        /* not advancing ptr; report error at the \g character */
1764
0
        *errorcodeptr = ERR57;
1765
0
        break;
1766
0
        }
1767
1768
      /* This is the reason that back references are returned as -(s+1) rather
1769
      than just -s. In a pattern, \0 is not a back reference, but \g<0> is
1770
      valid in a substitution string, so this must be representable. */
1771
0
      ptr = p + 1;
1772
0
      escape = -(s+1);
1773
0
      break;
1774
0
      }
1775
1776
12.2k
    if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1777
252
      {
1778
252
      escape = ESC_g;
1779
252
      break;
1780
252
      }
1781
1782
    /* If there is a brace delimiter, try to read a numerical reference. If
1783
    there isn't one, assume we have a name and treat it as \k. */
1784
1785
11.9k
    if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1786
200
      {
1787
200
      PCRE2_SPTR p = ptr + 1;
1788
1789
896
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1790
200
      if (!read_number(&p, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1791
200
          errorcodeptr))
1792
100
        {
1793
100
        if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1794
100
        break;
1795
100
        }
1796
549
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1797
1798
100
      if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1799
29
        {
1800
        /* not advancing ptr; report error at the \g character */
1801
29
        *errorcodeptr = ERR57;
1802
29
        break;
1803
29
        }
1804
71
      ptr = p + 1;
1805
71
      }
1806
1807
    /* Read an undelimited number */
1808
1809
11.7k
    else
1810
11.7k
      {
1811
11.7k
      if (!read_number(&ptr, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1812
11.7k
          errorcodeptr))
1813
14
        {
1814
14
        if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1815
14
        break;
1816
14
        }
1817
11.7k
      }
1818
1819
11.8k
    if (s <= 0)
1820
3
      {
1821
3
      *errorcodeptr = ERR15;
1822
3
      break;
1823
3
      }
1824
1825
11.8k
    escape = -(s+1);
1826
11.8k
    break;
1827
1828
    /* The handling of escape sequences consisting of a string of digits
1829
    starting with one that is not zero is not straightforward. Perl has changed
1830
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
1831
    recommended to avoid the ambiguities in the old syntax.
1832
1833
    Outside a character class, the digits are read as a decimal number. If the
1834
    number is less than 10, or if there are that many previous extracting left
1835
    brackets, it is a back reference. Otherwise, up to three octal digits are
1836
    read to form an escaped character code. Thus \123 is likely to be octal 123
1837
    (cf \0123, which is octal 012 followed by the literal 3). This is the "Perl
1838
    style" of handling ambiguous octal/backreferences such as \12.
1839
1840
    There is an alternative disambiguation strategy, selected by
1841
    PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must
1842
    have either a leading zero, or exactly three octal digits; otherwise it's
1843
    a backreference. The disambiguation is stable, and does not depend on how
1844
    many capture groups are defined (it's simply an invalid backreference if
1845
    there is no corresponding capture group). Additionally, octal values above
1846
    \377 (\xff) are rejected.
1847
1848
    Inside a character class, \ followed by a digit is always either a literal
1849
    8 or 9 or an octal number. */
1850
1851
25.6k
    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1852
28.0k
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1853
1854
28.0k
    if (isclass)
1855
6.26k
      {
1856
      /* Fall through to octal handling; never a backreference inside a class. */
1857
6.26k
      }
1858
21.7k
    else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)
1859
0
      {
1860
      /* Python-style disambiguation. */
1861
0
      if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&
1862
0
          ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1863
0
        {
1864
        /* We peeked a three-digit octal, so fall through */
1865
0
        }
1866
0
      else
1867
0
        {
1868
        /* We are at a digit, so the only possible error from read_number() is
1869
        a number that is too large. */
1870
0
        ptr--;   /* Back to the digit */
1871
1872
0
        if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1873
0
          {
1874
0
          *errorcodeptr = ERR61;
1875
0
          break;
1876
0
          }
1877
1878
0
        escape = -(s+1);
1879
0
        break;
1880
0
        }
1881
0
      }
1882
21.7k
    else
1883
21.7k
      {
1884
      /* Perl-style disambiguation. */
1885
21.7k
      oldptr = ptr;
1886
21.7k
      ptr--;   /* Back to the digit */
1887
1888
      /* As we know we are at a digit, the only possible error from
1889
      read_number() is a number that is too large to be a group number. Because
1890
      that number might be still valid if read as an octal, errorcodeptr is not
1891
      set on failure and therefore a sentinel value of INT_MAX is used instead
1892
      of the original value, and will be used later to properly set the error,
1893
      if not falling through. */
1894
1895
21.7k
      if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1896
1.90k
        s = INT_MAX;
1897
1898
      /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1899
      are octal escapes if there are not that many previous captures. */
1900
1901
21.7k
      if (s < 10 || c >= CHAR_8 || (unsigned)s <= bracount)
1902
18.6k
        {
1903
        /* s > MAX_GROUP_NUMBER should not be possible because of read_number(),
1904
        but we keep it just to be safe and because it will also catch the
1905
        sentinel value that was set on failure by that function. */
1906
1907
18.6k
        if ((unsigned)s > MAX_GROUP_NUMBER)
1908
7
          {
1909
7
          PCRE2_ASSERT(s == INT_MAX);
1910
7
          *errorcodeptr = ERR61;
1911
7
          }
1912
18.5k
        else escape = -(s+1);     /* Indicates a back reference */
1913
18.6k
        break;
1914
18.6k
        }
1915
1916
3.14k
      ptr = oldptr;      /* Put the pointer back and fall through */
1917
3.14k
      }
1918
1919
    /* Handle a digit following \ when the number is not a back reference, or
1920
    we are within a character class. If the first digit is 8 or 9, Perl used to
1921
    generate a binary zero and then treat the digit as a following literal. At
1922
    least by Perl 5.18 this changed so as not to insert the binary zero. */
1923
1924
9.41k
    if (c >= CHAR_8) break;
1925
1926
    /* Fall through */
1927
1928
    /* \0 always starts an octal number, but we may drop through to here with a
1929
    larger first octal digit. The original code used just to take the least
1930
    significant 8 bits of octal numbers (I think this is what early Perls used
1931
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,
1932
    but no more than 3 octal digits. At this point i is still zero (this branch
    is only entered when the table lookup above yielded zero), so the loop below
    consumes at most two further octal digits. */
1933
1934
16.5k
    case CHAR_0:
1935
16.5k
    c -= CHAR_0;
1936
22.7k
    while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1937
6.23k
        c = c * 8 + *ptr++ - CHAR_0;
1938
16.5k
    if (c > 0xff)
1939
371
      {
1940
371
      if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) *errorcodeptr = ERR102;
1941
371
#if PCRE2_CODE_UNIT_WIDTH == 8
1942
371
      else if (!utf) *errorcodeptr = ERR51;
1943
371
#endif
1944
371
      }
1945
1946
    /* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect
1947
    two- or three-character octal escapes \00 and \000, nor \x00. Here i == 1
    means the loop condition above was evaluated exactly once, i.e. no second
    octal digit was consumed. */
1948
1949
16.5k
    if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)
1950
0
        *errorcodeptr = ERR98;
1951
16.5k
    break;
1952
1953
    /* \o is a relatively new Perl feature, supporting a more general way of
1954
    specifying character codes in octal. The only supported form is \o{ddd},
1955
    with optional spaces or tabs after { and before }. */
1956
1957
586
    case CHAR_o:
1958
586
    if (ptr >= ptrend || *ptr++ != CHAR_LEFT_CURLY_BRACKET)
1959
10
      {
      /* NOTE(review): when ptr >= ptrend the *ptr++ above is not evaluated, so
      this decrement moves ptr to before the position it had on entry to this
      case rather than undoing an increment. */
1960
10
      ptr--;
1961
10
      *errorcodeptr = ERR55;
1962
10
      break;
1963
10
      }
1964
1965
1.14k
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1966
576
    if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1967
15
      {
1968
15
      *errorcodeptr = ERR78;
1969
15
      break;
1970
15
      }
1971
1972
561
    c = 0;
1973
561
    overflow = FALSE;
1974
1.37k
    while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1975
843
      {
1976
843
      cc = *ptr++;
1977
843
      if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1978
#if PCRE2_CODE_UNIT_WIDTH == 32
1979
      if (c >= 0x20000000u) { overflow = TRUE; break; }
1980
#endif
1981
467
      c = (c << 3) + (cc - CHAR_0);
1982
467
#if PCRE2_CODE_UNIT_WIDTH == 8
1983
467
      if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1984
#elif PCRE2_CODE_UNIT_WIDTH == 16
1985
      if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1986
#elif PCRE2_CODE_UNIT_WIDTH == 32
1987
      if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
1988
#endif
1989
467
      }
1990
1991
1.14k
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1992
1993
561
    if (overflow)
1994
30
      {
1995
834
      while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
1996
30
      *errorcodeptr = ERR34;
1997
30
      }
1998
531
    else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
1999
469
      {
2000
469
      if (utf && c >= 0xd800 && c <= 0xdfff &&
2001
469
          (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2002
0
        {
2003
0
        ptr--;
2004
0
        *errorcodeptr = ERR73;
2005
0
        }
2006
469
      }
2007
62
    else
2008
62
      {
      /* NOTE(review): if ptr was already at ptrend, the *ptr++ in the test above
      was not evaluated, so this decrement is not undoing an increment. */
2009
62
      ptr--;
2010
62
      *errorcodeptr = ERR64;
2011
62
      }
2012
561
    break;
2013
2014
    /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
2015
    by two hexadecimal digits. Otherwise it is a lowercase x letter. */
2016
2017
3.45k
    case CHAR_x:
2018
3.45k
    if (alt_bsux)
2019
1.38k
      {
2020
1.38k
      uint32_t xc;
2021
1.38k
      if (ptrend - ptr < 2) break;               /* Less than 2 characters */
2022
1.38k
      if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
2023
744
      if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
2024
494
      c = (cc << 4) | xc;
2025
494
      ptr += 2;
2026
494
      }
2027
2028
    /* Handle \x in Perl's style. \x{ddd} is a character code which can be
2029
    greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
2030
    digits. If not, { used to be treated as a data character. However, Perl
2031
    seems to read hex digits up to the first non-such, and ignore the rest, so
2032
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2033
    now gives an error. */
2034
2035
2.07k
    else
2036
2.07k
      {
2037
2.07k
      if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
2038
552
        {
2039
552
        ptr++;
2040
2.00k
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2041
2042
        /* The \N{U+hhh} handling earlier in this function jumps to the label
        below with ptr already set just past the "U+". */
552
#ifndef EBCDIC
2043
555
        COME_FROM_NU:
2044
555
#endif
2045
555
        if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
2046
15
          {
2047
15
          *errorcodeptr = ERR78;
2048
15
          break;
2049
15
          }
2050
540
        c = 0;
2051
540
        overflow = FALSE;
2052
2053
1.45k
        while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2054
936
          {
2055
936
          ptr++;
2056
936
          if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2057
#if PCRE2_CODE_UNIT_WIDTH == 32
2058
          if (c >= 0x10000000l) { overflow = TRUE; break; }
2059
#endif
2060
625
          c = (c << 4) | cc;
2061
625
          if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2062
25
            {
2063
25
            overflow = TRUE;
2064
25
            break;
2065
25
            }
2066
625
          }
2067
2068
        /* Perl ignores spaces and tabs before } */
2069
2070
2.10k
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2071
2072
        /* On overflow, skip remaining hex digits */
2073
2074
540
        if (overflow)
2075
25
          {
2076
497
          while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2077
25
          *errorcodeptr = ERR34;
2078
25
          }
2079
515
        else if (ptr < ptrend && *ptr++ == CHAR_RIGHT_CURLY_BRACKET)
2080
471
          {
2081
471
          if (utf && c >= 0xd800 && c <= 0xdfff &&
2082
471
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2083
0
            {
2084
0
            ptr--;
2085
0
            *errorcodeptr = ERR73;
2086
0
            }
2087
471
          }
2088
2089
        /* If the sequence of hex digits (followed by optional space) does not
2090
        end with '}', give an error. We used just to recognize this construct
2091
        and fall through to the normal \x handling, but nowadays Perl gives an
2092
        error, which seems much more sensible, so we do too. */
2093
2094
44
        else
2095
44
          {
          /* NOTE(review): if ptr was already at ptrend, the *ptr++ in the test
          above was not evaluated, so this decrement is not undoing an
          increment. */
2096
44
          ptr--;
2097
44
          *errorcodeptr = ERR67;
2098
44
          }
2099
540
        }   /* End of \x{} processing */
2100
2101
      /* Read up to two hex digits after \x */
2102
2103
1.52k
      else
2104
1.52k
        {
2105
        /* Perl has the surprising/broken behaviour that \x without following
2106
        hex digits is treated as an escape for NUL. Their source code laments
2107
        this but keeps it for backwards compatibility. A warning is printed
2108
        when "use warnings" is enabled. Because we don't have warnings, we
2109
        simply forbid it. */
2110
1.52k
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff)
2111
48
          {
2112
          /* Not a hex digit */
2113
48
          *errorcodeptr = ERR78;
2114
48
          break;
2115
48
          }
2116
1.47k
        ptr++;
2117
1.47k
        c = cc;
2118
2119
        /* With "use re 'strict'" Perl actually requires exactly two digits (error
2120
        for \x, \xA and \xAAA). While \x was already rejected, this seems overly
2121
        strict, and there seems little incentive to align with that, given the
2122
        backwards-compatibility cost.
2123
2124
        For comparison, note that other engines disagree. For example:
2125
          - Java allows 1 or 2 hex digits. Error if 0 digits. No error if >2 digits
2126
          - .NET requires 2 hex digits. Error if 0, 1 digits. No error if >2 digits.
2127
        */
2128
1.47k
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2129
1.02k
        ptr++;
2130
1.02k
        c = (c << 4) | cc;
2131
1.02k
        }     /* End of \xdd handling */
2132
2.07k
      }       /* End of Perl-style \x handling */
2133
2.06k
    break;
2134
2135
    /* The handling of \c is different in ASCII and EBCDIC environments. In an
2136
    ASCII (or Unicode) environment, an error is given if the character
2137
    following \c is not a printable ASCII character. Otherwise, the following
2138
    character is upper-cased if it is a letter, and after that the 0x40 bit is
2139
    flipped. The result is the value of the escape.
2140
2141
    In an EBCDIC environment the handling of \c is compatible with the
2142
    specification in the perlebcdic document. The following character must be
2143
    a letter or one of small number of special characters. These provide a
2144
    means of defining the character values 0-31.
2145
2146
    For testing the EBCDIC handling of \c in an ASCII environment, recognize
2147
    the EBCDIC value of 'c' explicitly. */
2148
2149
2.06k
    case CHAR_c:
2150
1.32k
    if (ptr >= ptrend)
2151
3
      {
2152
3
      *errorcodeptr = ERR2;
2153
3
      break;
2154
3
      }
2155
1.31k
    c = *ptr;
2156
1.31k
    if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2157
2158
    /* Handle \c in an ASCII/Unicode environment. */
2159
2160
1.31k
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
2161
1.31k
    if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2162
6
      {
2163
6
      *errorcodeptr = ERR68;
2164
6
      break;
2165
6
      }
2166
1.31k
    c ^= 0x40;
2167
2168
    /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2169
    255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2170
    POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2171
    The other valid sequences correspond to a list of specific characters. */
2172
2173
#else
2174
    if (c == CHAR_QUESTION_MARK)
2175
      c = (CHAR_BACKSLASH == 188 && CHAR_GRAVE_ACCENT == 74)? 0x5f : 0xff;
2176
    else
2177
      {
2178
      for (i = 0; i < 32; i++)
2179
        {
2180
        if (c == ebcdic_escape_c[i]) break;
2181
        }
2182
      if (i < 32) c = i; else *errorcodeptr = ERR68;
2183
      }
2184
#endif  /* EBCDIC */
2185
2186
1.31k
    ptr++;
2187
1.31k
    break;
2188
2189
    /* Any other alphanumeric following \ is an error. Perl gives an error only
2190
    if in warning mode, but PCRE doesn't have a warning mode. */
2191
2192
6
    default:
2193
6
    *errorcodeptr = ERR3;
2194
6
    *ptrptr = ptr - 1;     /* Point to the character at fault */
2195
6
    return 0;
2196
59.9k
    }
2197
59.9k
  }
2198
2199
/* Set the pointer to the next character before returning. */
2200
2201
1.26M
*ptrptr = ptr;
2202
1.26M
*chptr = c;
2203
1.26M
return escape;
2204
1.26M
}
2205
2206
2207
2208
#ifdef SUPPORT_UNICODE
2209
/*************************************************
2210
*               Handle \P and \p                 *
2211
*************************************************/
2212
2213
/* This function is called after \P or \p has been encountered, provided that
2214
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2215
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2216
after the final code unit of the escape sequence.
2217
2218
Arguments:
2219
  ptrptr         the pattern position pointer
2220
  negptr         a boolean that is set TRUE for negation else FALSE
2221
  ptypeptr       an unsigned int that is set to the type value
2222
  pdataptr       an unsigned int that is set to the detailed property value
2223
  errorcodeptr   the error code variable
2224
  cb             the compile data
2225
2226
Returns:         TRUE if the type value was found, or FALSE for an invalid type
2227
*/
2228
2229
static BOOL
2230
get_ucp(PCRE2_SPTR *ptrptr, BOOL *negptr, uint16_t *ptypeptr,
2231
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2232
42.2k
{
2233
42.2k
PCRE2_UCHAR c;
2234
42.2k
PCRE2_SIZE i, bot, top;
2235
42.2k
PCRE2_SPTR ptr = *ptrptr;
2236
42.2k
PCRE2_UCHAR name[50];
2237
42.2k
PCRE2_UCHAR *vptr = NULL;
2238
42.2k
uint16_t ptscript = PT_NOTSCRIPT;
2239
2240
42.2k
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2241
42.2k
c = *ptr++;
2242
42.2k
*negptr = FALSE;
2243
2244
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2245
negation. We must be handling Unicode encoding here, though we may be compiling
2246
for UTF-8 input in an EBCDIC environment. (PCRE2 does not support both EBCDIC
2247
input and Unicode input in the same build.) In accordance with Unicode's "loose
2248
matching" rules, ASCII white space, hyphens, and underscores are ignored. We
2249
don't use isspace() or tolower() because (a) code points may be greater than
2250
255, and (b) they wouldn't work when compiling for Unicode in an EBCDIC
2251
environment. */
2252
2253
42.2k
if (c == CHAR_LEFT_CURLY_BRACKET)
2254
36.1k
  {
2255
36.1k
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2256
2257
132k
  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2258
132k
    {
2259
133k
    REDO:
2260
2261
133k
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2262
133k
    c = *ptr++;
2263
2264
    /* Skip ignorable Unicode characters. */
2265
2266
134k
    while (c == CHAR_UNDERSCORE || c == CHAR_MINUS || c == CHAR_SPACE ||
2267
134k
          (c >= CHAR_HT && c <= CHAR_CR))
2268
1.48k
      {
2269
1.48k
      if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2270
1.47k
      c = *ptr++;
2271
1.47k
      }
2272
2273
    /* The first significant character being circumflex negates the meaning of
2274
    the item. */
2275
2276
133k
    if (i == 0 && !*negptr && c == CHAR_CIRCUMFLEX_ACCENT)
2277
868
      {
2278
868
      *negptr = TRUE;
2279
868
      goto REDO;
2280
868
      }
2281
2282
132k
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2283
2284
    /* Names consist of ASCII letters and digits, but equals and colon may also
2285
    occur as a name/value separator. We must also allow for \p{L&}. A simple
2286
    check for a value between '&' and 'z' suffices because anything else in a
2287
    name or value will cause an "unknown property" error anyway. */
2288
2289
96.2k
    if (c < CHAR_AMPERSAND || c > CHAR_z) goto ERROR_RETURN;
2290
2291
    /* Lower case a capital letter or remember where the name/value separator
2292
    is. */
2293
2294
96.2k
    if (c >= CHAR_A && c <= CHAR_Z) c |= 0x20;
2295
49.8k
    else if ((c == CHAR_COLON || c == CHAR_EQUALS_SIGN) && vptr == NULL)
2296
2.59k
      vptr = name + i;
2297
2298
96.2k
    name[i] = c;
2299
96.2k
    }
2300
2301
  /* Error if the loop didn't end with '}' - either we hit the end of the
2302
  pattern or the name was longer than any legal property name. */
2303
2304
36.0k
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2305
36.0k
  name[i] = 0;
2306
36.0k
  }
2307
2308
/* If { doesn't follow \p or \P there is just one following character, which
2309
must be an ASCII letter. */
2310
2311
6.12k
else if (c >= CHAR_A && c <= CHAR_Z)
2312
3.24k
  {
2313
3.24k
  name[0] = c | 0x20;  /* Lower case */
2314
3.24k
  name[1] = 0;
2315
3.24k
  }
2316
2.87k
else if (c >= CHAR_a && c <= CHAR_z)
2317
2.85k
  {
2318
2.85k
  name[0] = c;
2319
2.85k
  name[1] = 0;
2320
2.85k
  }
2321
21
else goto ERROR_RETURN;
2322
2323
42.1k
*ptrptr = ptr;   /* Update pattern pointer */
2324
2325
/* If the property contains ':' or '=' we have class name and value separately
2326
specified. The following are supported:
2327
2328
  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2329
  . Script (synonym sc) for which the property name is the script name
2330
  . Script_Extensions (synonym scx), ditto
2331
2332
As this is a small number, we currently just check the names directly. If this
2333
grows, a sorted table and a switch will be neater.
2334
2335
For both the script properties, set a PT_xxx value so that (1) they can be
2336
distinguished and (2) invalid script names that happen to be the name of
2337
another property can be diagnosed. */
2338
2339
42.1k
if (vptr != NULL)
2340
2.56k
  {
2341
2.56k
  int offset = 0;
2342
2.56k
  PCRE2_UCHAR sname[8];
2343
2344
2.56k
  *vptr = 0;   /* Terminate property name */
2345
2.56k
  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2346
2.56k
      PRIV(strcmp_c8)(name, STRING_bc) == 0)
2347
2.39k
    {
2348
2.39k
    offset = 4;
2349
2.39k
    sname[0] = CHAR_b;
2350
2.39k
    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2351
2.39k
    sname[2] = CHAR_d;
2352
2.39k
    sname[3] = CHAR_i;
2353
2.39k
    }
2354
2355
174
  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2356
174
           PRIV(strcmp_c8)(name, STRING_sc) == 0)
2357
89
    ptscript = PT_SC;
2358
2359
85
  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2360
85
           PRIV(strcmp_c8)(name, STRING_scx) == 0)
2361
74
    ptscript = PT_SCX;
2362
2363
11
  else
2364
11
    {
2365
11
    *errorcodeptr = ERR47;
2366
11
    return FALSE;
2367
11
    }
2368
2369
  /* Adjust the string in name[] as needed */
2370
2371
2.55k
  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2372
2.55k
  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2373
2.55k
  }
2374
2375
/* Search for a recognized property using binary chop. */
2376
2377
42.1k
bot = 0;
2378
42.1k
top = PRIV(utt_size);
2379
2380
316k
while (bot < top)
2381
316k
  {
2382
316k
  int r;
2383
316k
  i = (bot + top) >> 1;
2384
316k
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2385
2386
  /* When a matching property is found, some extra checking is needed when the
2387
  \p{xx:yy} syntax is used and xx is either sc or scx. */
2388
2389
316k
  if (r == 0)
2390
42.0k
    {
2391
42.0k
    *pdataptr = PRIV(utt)[i].value;
2392
42.0k
    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2393
41.9k
      {
2394
41.9k
      *ptypeptr = PRIV(utt)[i].type;
2395
41.9k
      return TRUE;
2396
41.9k
      }
2397
2398
158
    switch (PRIV(utt)[i].type)
2399
158
      {
2400
71
      case PT_SC:
2401
71
      *ptypeptr = PT_SC;
2402
71
      return TRUE;
2403
2404
84
      case PT_SCX:
2405
84
      *ptypeptr = ptscript;
2406
84
      return TRUE;
2407
158
      }
2408
2409
3
    break;  /* Non-script found */
2410
158
    }
2411
2412
274k
  if (r > 0) bot = i + 1; else top = i;
2413
274k
  }
2414
2415
31
*errorcodeptr = ERR47;   /* Unrecognized property */
2416
31
return FALSE;
2417
2418
138
ERROR_RETURN:            /* Malformed \P or \p */
2419
138
*errorcodeptr = ERR46;
2420
138
*ptrptr = ptr;
2421
138
return FALSE;
2422
42.1k
}
2423
#endif
2424
2425
2426
2427
/*************************************************
2428
*           Check for POSIX class syntax         *
2429
*************************************************/
2430
2431
/* This function is called when the sequence "[:" or "[." or "[=" is
2432
encountered in a character class. It checks whether this is followed by a
2433
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2434
reach an unescaped ']' without the special preceding character, return FALSE.
2435
2436
Originally, this function only recognized a sequence of letters between the
2437
terminators, but it seems that Perl recognizes any sequence of characters,
2438
though of course unknown POSIX names are subsequently rejected. Perl gives an
2439
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2440
didn't consider this to be a POSIX class. Likewise for [:1234:].
2441
2442
The problem in trying to be exactly like Perl is in the handling of escapes. We
2443
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2444
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2445
below handles the special cases \\ and \], but does not try to do any other
2446
escape processing. This makes it different from Perl for cases such as
2447
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2448
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2449
when Perl does, I think.
2450
2451
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2452
It seems that the appearance of a nested POSIX class supersedes an apparent
2453
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2454
a digit. This is handled by returning FALSE if the start of a new group with
2455
the same terminator is encountered, since the next closing sequence must close
2456
the nested group, not the outer one.
2457
2458
In Perl, unescaped square brackets may also appear as part of class names. For
2459
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2460
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2461
seem right at all. PCRE does not allow closing square brackets in POSIX class
2462
names.
2463
2464
Arguments:
2465
  ptr      pointer to the character after the initial [ (colon, dot, equals)
2466
  ptrend   pointer to the end of the pattern
2467
  endptr   where to return a pointer to the terminating ':', '.', or '='
2468
2469
Returns:   TRUE or FALSE
2470
*/
2471
2472
static BOOL
2473
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2474
70.6k
{
2475
70.6k
PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2476
70.6k
terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2477
2478
1.01M
for (; ptrend - ptr >= 2; ptr++)
2479
1.01M
  {
2480
1.01M
  if (*ptr == CHAR_BACKSLASH &&
2481
1.01M
      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2482
4.65k
    ptr++;
2483
2484
1.01M
  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2485
1.01M
            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2486
2487
972k
  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2488
32.3k
    {
2489
32.3k
    *endptr = ptr;
2490
32.3k
    return TRUE;
2491
32.3k
    }
2492
1.01M
  }
2493
2494
234
return FALSE;
2495
70.6k
}
2496
2497
2498
2499
/*************************************************
2500
*          Check POSIX class name                *
2501
*************************************************/
2502
2503
/* This function is called to check the name given in a POSIX-style class entry
2504
such as [:alnum:].
2505
2506
Arguments:
2507
  ptr        points to the first letter
2508
  len        the length of the name
2509
2510
Returns:     a value representing the name, or -1 if unknown
2511
*/
2512
2513
static int
2514
check_posix_name(PCRE2_SPTR ptr, int len)
2515
32.0k
{
2516
32.0k
const char *pn = posix_names;
2517
32.0k
int yield = 0;
2518
305k
while (posix_name_lengths[yield] != 0)
2519
305k
  {
2520
305k
  if (len == posix_name_lengths[yield] &&
2521
305k
    PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2522
273k
  pn += posix_name_lengths[yield] + 1;
2523
273k
  yield++;
2524
273k
  }
2525
31
return -1;
2526
32.0k
}
2527
2528
2529
2530
/*************************************************
2531
*       Read a subpattern or VERB name           *
2532
*************************************************/
2533
2534
/* This function is called from parse_regex() below whenever it needs to read
2535
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2536
pointer must be to the preceding character. If that character is '*' we are
2537
reading a verb or alpha assertion name. The pointer is updated to point after
2538
the name, for a VERB or alpha assertion name, or after the name's terminator
2539
for a subpattern name. Returning both the offset and the name pointer is
2540
redundant information, but some callers use one and some the other, so it is
2541
simplest just to return both. When the name is in braces, spaces and tabs are
2542
allowed (and ignored) at either end.
2543
2544
Arguments:
2545
  ptrptr      points to the character pointer variable
2546
  ptrend      points to the end of the input string
2547
  utf         true if the input is UTF-encoded
2548
  terminator  the terminator of a subpattern name must be this
2549
  offsetptr   where to put the offset from the start of the pattern
2550
  nameptr     where to put a pointer to the name in the input
2551
  namelenptr  where to put the length of the name
2552
  errorcodeptr where to put an error code
2553
  cb          pointer to the compile data block
2554
2555
Returns:    TRUE if a name was read
2556
            FALSE otherwise, with error code set
2557
*/
2558
2559
static BOOL
2560
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2561
  PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2562
  int *errorcodeptr, compile_block *cb)
2563
72.9k
{
2564
72.9k
PCRE2_SPTR ptr = *ptrptr;
2565
72.9k
BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2566
72.9k
BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2567
2568
72.9k
if (is_braced)
2569
641
  while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2570
2571
72.9k
if (ptr >= ptrend)                 /* No characters in name */
2572
23
  {
2573
23
  *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2574
23
                            ERR60; /* Verb not recognized or malformed */
2575
23
  goto FAILED;
2576
23
  }
2577
2578
72.8k
*nameptr = ptr;
2579
72.8k
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2580
2581
/* If this logic were ever to change, the matching function in pcre2_substitute.c
2582
ought to be updated to match. */
2583
2584
/* In UTF mode, a group name may contain letters and decimal digits as defined
2585
by Unicode properties, and underscores, but must not start with a digit. */
2586
2587
72.8k
#ifdef SUPPORT_UNICODE
2588
72.8k
if (utf && is_group)
2589
594
  {
2590
594
  uint32_t c, type;
2591
2592
594
  GETCHAR(c, ptr);
2593
594
  type = UCD_CHARTYPE(c);
2594
2595
594
  if (type == ucp_Nd)
2596
3
    {
2597
3
    *errorcodeptr = ERR44;
2598
3
    goto FAILED;
2599
3
    }
2600
2601
591
  for(;;)
2602
1.68k
    {
2603
1.68k
    if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2604
1.68k
        c != CHAR_UNDERSCORE) break;
2605
1.11k
    ptr++;
2606
1.11k
    FORWARDCHARTEST(ptr, ptrend);
2607
1.11k
    if (ptr >= ptrend) break;
2608
1.09k
    GETCHAR(c, ptr);
2609
1.09k
    type = UCD_CHARTYPE(c);
2610
1.09k
    }
2611
591
  }
2612
72.3k
else
2613
#else
2614
(void)utf;  /* Avoid compiler warning */
2615
#endif      /* SUPPORT_UNICODE */
2616
2617
/* Handle non-group names and group names in non-UTF modes. A group name must
2618
not start with a digit. If either of the others start with a digit it just
2619
won't be recognized. */
2620
2621
72.3k
  {
2622
72.3k
  if (is_group && IS_DIGIT(*ptr))
2623
4
    {
2624
4
    *errorcodeptr = ERR44;
2625
4
    goto FAILED;
2626
4
    }
2627
2628
274k
  while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2629
201k
    {
2630
201k
    ptr++;
2631
201k
    }
2632
72.2k
  }
2633
2634
/* Check name length */
2635
2636
72.8k
if (ptr > *nameptr + MAX_NAME_SIZE)
2637
10
  {
2638
10
  *errorcodeptr = ERR48;
2639
10
  goto FAILED;
2640
10
  }
2641
72.8k
*namelenptr = (uint32_t)(ptr - *nameptr);
2642
2643
/* Subpattern names must not be empty, and their terminator is checked here.
2644
(What follows a verb or alpha assertion name is checked separately.) */
2645
2646
72.8k
if (is_group)
2647
36.6k
  {
2648
36.6k
  if (ptr == *nameptr)
2649
74
    {
2650
74
    *errorcodeptr = ERR62;   /* Subpattern name expected */
2651
74
    goto FAILED;
2652
74
    }
2653
36.5k
  if (is_braced)
2654
535
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2655
36.5k
  if (terminator != 0)
2656
36.1k
    {
2657
36.1k
    if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2658
114
      {
2659
114
      *errorcodeptr = ERR42;
2660
114
      goto FAILED;
2661
114
      }
2662
35.9k
    ptr++;
2663
35.9k
    }
2664
36.5k
  }
2665
2666
72.6k
*ptrptr = ptr;
2667
72.6k
return TRUE;
2668
2669
228
FAILED:
2670
228
*ptrptr = ptr;
2671
228
return FALSE;
2672
72.8k
}
2673
2674
2675
2676
/**************************************************
2677
*        Parse capturing bracket argument list    *
2678
**************************************************/
2679
2680
/* Reads a list of capture references. The references
2681
can be numbers or names.
2682
2683
Arguments:
2684
  ptrptr           points to the character pointer variable
2685
  ptrend           points to the end of the input string
2686
  utf              true if the input is UTF-encoded
2687
  parsed_pattern   the parsed pattern pointer
2688
  offset           last known offset
2689
  errorcodeptr     where to put an error code
2690
  cb               pointer to the compile data block
2691
2692
Returns: updated parsed_pattern pointer on success
2693
         NULL otherwise
2694
*/
2695
2696
static uint32_t *
2697
parse_capture_list(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
2698
  BOOL utf, uint32_t *parsed_pattern, PCRE2_SIZE offset,
2699
  int *errorcodeptr, compile_block *cb)
2700
517
{
2701
517
PCRE2_SIZE next_offset;
2702
517
PCRE2_SPTR ptr = *ptrptr;
2703
517
PCRE2_SPTR name;
2704
517
PCRE2_UCHAR terminator;
2705
517
uint32_t meta, namelen;
2706
517
int i;
2707
2708
517
if (ptr >= ptrend || *ptr != CHAR_LEFT_PARENTHESIS)
2709
0
  {
2710
0
  *errorcodeptr = ERR118;
2711
0
  goto FAILED;
2712
0
  }
2713
2714
517
for (;;)
2715
531
  {
2716
531
  ptr++;
2717
531
  next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
2718
2719
531
  if (ptr >= ptrend)
2720
3
    {
2721
3
    *errorcodeptr = ERR117;
2722
3
    goto FAILED;
2723
3
    }
2724
2725
  /* Handle [+-]number cases */
2726
528
  if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
2727
528
      &i, errorcodeptr))
2728
474
    {
2729
474
    PCRE2_ASSERT(i >= 0);
2730
474
    if (i <= 0)
2731
7
      {
2732
7
      *errorcodeptr = ERR15;
2733
7
      goto FAILED;
2734
7
      }
2735
467
    meta = META_CAPTURE_NUMBER;
2736
467
    namelen = (uint32_t)i;
2737
467
    }
2738
54
  else if (*errorcodeptr != 0) goto FAILED; /* Number too big */
2739
31
  else
2740
31
    {
2741
    /* Handle 'name' or <name> cases. */
2742
31
    if (*ptr == CHAR_LESS_THAN_SIGN)
2743
5
      terminator = CHAR_GREATER_THAN_SIGN;
2744
26
    else if (*ptr == CHAR_APOSTROPHE)
2745
5
      terminator = CHAR_APOSTROPHE;
2746
21
    else
2747
21
      {
2748
21
      *errorcodeptr = ERR117;
2749
21
      goto FAILED;
2750
21
      }
2751
2752
10
    if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,
2753
10
        &name, &namelen, errorcodeptr, cb)) goto FAILED;
2754
2755
5
    meta = META_CAPTURE_NAME;
2756
5
    }
2757
2758
472
  PCRE2_ASSERT(next_offset > 0);
2759
472
  if (offset == 0 || (next_offset - offset) >= 0x10000)
2760
0
    {
2761
0
    *parsed_pattern++ = META_OFFSET;
2762
0
    PUTOFFSET(next_offset, parsed_pattern);
2763
0
    offset = next_offset;
2764
0
    }
2765
2766
  /* The offset is encoded as a relative offset, because for some
2767
  inputs such as ",2" in (1,2,3), we only have space for two uint32_t
2768
  values, and an opcode and absolute offset may require three uint32_t
2769
  values. */
2770
472
  *parsed_pattern++ = meta | (uint32_t)(next_offset - offset);
2771
472
  *parsed_pattern++ = namelen;
2772
472
  offset = next_offset;
2773
2774
472
  if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
2775
2776
469
  if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
2777
2778
26
  if (*ptr != CHAR_COMMA)
2779
12
    {
2780
12
    *errorcodeptr = ERR24;
2781
12
    goto FAILED;
2782
12
    }
2783
26
  }
2784
2785
443
*ptrptr = ptr + 1;
2786
443
return parsed_pattern;
2787
2788
3
UNCLOSED_PARENTHESIS:
2789
3
*errorcodeptr = ERR14;
2790
2791
74
FAILED:
2792
74
*ptrptr = ptr;
2793
74
return NULL;
2794
3
}
2795
2796
2797
2798
/*************************************************
2799
*          Manage callouts at start of cycle     *
2800
*************************************************/
2801
2802
/* At the start of a new item in parse_regex() we are able to record the
2803
details of the previous item in a prior callout, and also to set up an
2804
automatic callout if enabled. Avoid having two adjacent automatic callouts,
2805
which would otherwise happen for items such as \Q that contribute nothing to
2806
the parsed pattern.
2807
2808
Arguments:
2809
  ptr              current pattern pointer
2810
  pcalloutptr      points to a pointer to previous callout, or NULL
2811
  auto_callout     TRUE if auto_callouts are enabled
2812
  parsed_pattern   the parsed pattern pointer
2813
  cb               compile block
2814
2815
Returns: possibly updated parsed_pattern pointer.
2816
*/
2817
2818
static uint32_t *
2819
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2820
  uint32_t *parsed_pattern, compile_block *cb)
2821
14.9M
{
2822
14.9M
uint32_t *previous_callout = *pcalloutptr;
2823
2824
14.9M
if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2825
1.76M
  cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2826
2827
14.9M
if (!auto_callout) previous_callout = NULL; else
2828
1.76M
  {
2829
1.76M
  if (previous_callout == NULL ||
2830
1.76M
      previous_callout != parsed_pattern - 4 ||
2831
1.76M
      previous_callout[3] != 255)
2832
1.76M
    {
2833
1.76M
    previous_callout = parsed_pattern;  /* Set up new automatic callout */
2834
1.76M
    parsed_pattern += 4;
2835
1.76M
    previous_callout[0] = META_CALLOUT_NUMBER;
2836
1.76M
    previous_callout[2] = 0;
2837
1.76M
    previous_callout[3] = 255;
2838
1.76M
    }
2839
1.76M
  previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2840
1.76M
  }
2841
2842
14.9M
*pcalloutptr = previous_callout;
2843
14.9M
return parsed_pattern;
2844
14.9M
}
2845
2846
2847
2848
/*************************************************
2849
*          Handle \d, \D, \s, \S, \w, \W         *
2850
*************************************************/
2851
2852
/* This function is called from parse_regex() below, both for freestanding
2853
escapes, and those within classes, to handle those escapes that may change when
2854
Unicode property support is requested. Note that PCRE2_UCP will never be set
2855
without Unicode support because that is checked when pcre2_compile() is called.
2856
2857
Arguments:
2858
  escape          the ESC_... value
2859
  parsed_pattern  where to add the code
2860
  options         options bits
2861
  xoptions        extra options bits
2862
2863
Returns:          updated value of parsed_pattern
2864
*/
2865
static uint32_t *
2866
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2867
  uint32_t xoptions)
2868
519k
{
2869
519k
uint32_t ascii_option = 0;
2870
519k
uint32_t prop = ESC_p;
2871
2872
519k
switch(escape)
2873
519k
  {
2874
52.9k
  case ESC_D:
2875
52.9k
  prop = ESC_P;
2876
  /* Fall through */
2877
101k
  case ESC_d:
2878
101k
  ascii_option = PCRE2_EXTRA_ASCII_BSD;
2879
101k
  break;
2880
2881
100k
  case ESC_S:
2882
100k
  prop = ESC_P;
2883
  /* Fall through */
2884
202k
  case ESC_s:
2885
202k
  ascii_option = PCRE2_EXTRA_ASCII_BSS;
2886
202k
  break;
2887
2888
32.0k
  case ESC_W:
2889
32.0k
  prop = ESC_P;
2890
  /* Fall through */
2891
216k
  case ESC_w:
2892
216k
  ascii_option = PCRE2_EXTRA_ASCII_BSW;
2893
216k
  break;
2894
519k
  }
2895
2896
519k
if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2897
418k
  {
2898
418k
  *parsed_pattern++ = META_ESCAPE + escape;
2899
418k
  }
2900
101k
else
2901
101k
  {
2902
101k
  *parsed_pattern++ = META_ESCAPE + prop;
2903
101k
  switch(escape)
2904
101k
    {
2905
10.8k
    case ESC_d:
2906
25.6k
    case ESC_D:
2907
25.6k
    *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2908
25.6k
    break;
2909
2910
17.8k
    case ESC_s:
2911
40.8k
    case ESC_S:
2912
40.8k
    *parsed_pattern++ = PT_SPACE << 16;
2913
40.8k
    break;
2914
2915
24.9k
    case ESC_w:
2916
35.3k
    case ESC_W:
2917
35.3k
    *parsed_pattern++ = PT_WORD << 16;
2918
35.3k
    break;
2919
101k
    }
2920
101k
  }
2921
2922
519k
return parsed_pattern;
2923
519k
}
2924
2925
2926
2927
/*************************************************
2928
* Maximum size of parsed_pattern for given input *
2929
*************************************************/
2930
2931
/* This function is called from parse_regex() below, to determine the amount
2932
of memory to allocate for parsed_pattern. It is also called to check whether
2933
the amount of data written respects the amount of memory allocated.
2934
2935
Arguments:
2936
  ptr             points to the start of the pattern
2937
  ptrend          points to the end of the pattern
2938
  utf             TRUE in UTF mode
2939
  options         the options bits
2940
2941
Returns:          the number of uint32_t units for parsed_pattern
2942
*/
2943
static ptrdiff_t
2944
max_parsed_pattern(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, BOOL utf,
2945
  uint32_t options)
2946
61.8k
{
2947
61.8k
PCRE2_SIZE big32count = 0;
2948
61.8k
ptrdiff_t parsed_size_needed;
2949
2950
/* When PCRE2_AUTO_CALLOUT is not set, in all but one case the number of
2951
unsigned 32-bit ints written out to the parsed pattern is bounded by the length
2952
of the pattern. The exceptional case is when running in 32-bit, non-UTF mode,
2953
when literal characters greater than META_END (0x80000000) have to be coded as
2954
two units. In this case, therefore, we scan the pattern to check for such
2955
values. */
2956
2957
#if PCRE2_CODE_UNIT_WIDTH == 32
2958
if (!utf)
2959
  {
2960
  PCRE2_SPTR p;
2961
  for (p = ptr; p < ptrend; p++) if (*p >= META_END) big32count++;
2962
  }
2963
#else
2964
61.8k
(void)utf;  /* Avoid compiler warning */
2965
61.8k
#endif
2966
2967
61.8k
parsed_size_needed = (ptrend - ptr) + big32count;
2968
2969
/* When PCRE2_AUTO_CALLOUT is set we have to assume a numerical callout (4
2970
elements) for each character. This is overkill, but memory is plentiful these
2971
days. */
2972
2973
61.8k
if ((options & PCRE2_AUTO_CALLOUT) != 0)
2974
10.4k
  parsed_size_needed += (ptrend - ptr) * 4;
2975
2976
61.8k
return parsed_size_needed;
2977
61.8k
}
2978
2979
2980
2981
/*************************************************
2982
*      Parse regex and identify named groups     *
2983
*************************************************/
2984
2985
/* This function is called first of all. It scans the pattern and does two
2986
things: (1) It identifies capturing groups and makes a table of named capturing
2987
groups so that information about them is fully available to both the compiling
2988
scans. (2) It writes a parsed version of the pattern with comments omitted and
2989
escapes processed into the parsed_pattern vector.
2990
2991
Arguments:
2992
  ptr             points to the start of the pattern
2993
  options         compiling dynamic options (may change during the scan)
  xoptions        extra options bits
2994
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
2995
  cb              pointer to the compile data block
2996
2997
Returns:   zero on success or a non-zero error code, with the
2998
             error offset placed in the cb field
2999
*/
3000
3001
/* A structure and some flags for dealing with nested groups. */
3002
3003
typedef struct nest_save {
3004
  uint16_t  nest_depth;
3005
  uint16_t  reset_group;
3006
  uint16_t  max_group;
3007
  uint16_t  flags;
3008
  uint32_t  options;
3009
  uint32_t  xoptions;
3010
} nest_save;
3011
3012
57.6k
#define NSF_RESET          0x0001u
3013
33.4k
#define NSF_CONDASSERT     0x0002u
3014
21.7k
#define NSF_ATOMICSR       0x0004u
3015
3016
/* Options that are changeable within the pattern must be tracked during
3017
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
3018
but all must be tracked so that META_OPTIONS items set the correct values for
3019
the main compiling phase. */
3020
3021
51.4k
#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
3022
51.4k
  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
3023
51.4k
  PCRE2_UNGREEDY)
3024
3025
51.4k
#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \
3026
51.4k
  PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
3027
51.4k
  PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX)
3028
3029
/* States used for analyzing ranges in character classes. The two OK values
3030
must be last. */
3031
3032
enum {
3033
  RANGE_NO, /* State after '[' (initial), or '[a-z'; hyphen is literal */
3034
  RANGE_STARTED, /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
3035
  RANGE_FORBID_NO, /* State after '[\d'; '-]' is allowed but not '-1]' */
3036
  RANGE_FORBID_STARTED, /* State after '[\d-'*/
3037
  RANGE_OK_ESCAPED, /* State after '[\1'; hyphen may be a range */
3038
  RANGE_OK_LITERAL /* State after '[1'; hyphen may be a range */
3039
};
3040
3041
/* States used for analyzing operators and operands in extended character
3042
classes. */
3043
3044
enum {
3045
  CLASS_OP_EMPTY, /* At start of an expression; empty previous contents */
3046
  CLASS_OP_OPERAND, /* Have preceding operand; after "z" a "--" can follow */
3047
  CLASS_OP_OPERATOR /* Have preceding operator; after "--" operand must follow */
3048
};
3049
3050
/* States used for determining the parse mode in character classes. The two
3051
PERL_EXT values must be last. */
3052
3053
enum {
3054
  CLASS_MODE_NORMAL, /* Ordinary PCRE2 '[...]' class. */
3055
  CLASS_MODE_ALT_EXT, /* UTS#18-style extended '[...]' class. */
3056
  CLASS_MODE_PERL_EXT, /* Perl extended '(?[...])' class. */
3057
  CLASS_MODE_PERL_EXT_LEAF /* Leaf within extended '(?[ [...] ])' class. */
3058
};
3059
3060
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
3061
the storing of literal values in the main parsed pattern, where they can always
3062
be quantified. */
3063
3064
#if PCRE2_CODE_UNIT_WIDTH == 32
3065
#define PARSED_LITERAL(c, p) \
3066
  { \
3067
  if (c >= META_END) *p++ = META_BIGVALUE; \
3068
  *p++ = c; \
3069
  okquantifier = TRUE; \
3070
  }
3071
#else
3072
18.0M
#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
3073
#endif
3074
3075
/* Here's the actual function. */
3076
3077
static int parse_regex(PCRE2_SPTR ptr, uint32_t options, uint32_t xoptions,
3078
  BOOL *has_lookbehind, compile_block *cb)
3079
61.8k
{
3080
61.8k
uint32_t c;
3081
61.8k
uint32_t delimiter;
3082
61.8k
uint32_t namelen;
3083
61.8k
uint32_t class_range_state;
3084
61.8k
uint32_t class_op_state;
3085
61.8k
uint32_t class_mode_state;
3086
61.8k
uint32_t *class_start;
3087
61.8k
uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
3088
61.8k
uint32_t *verbstartptr = NULL;
3089
61.8k
uint32_t *previous_callout = NULL;
3090
61.8k
uint32_t *parsed_pattern = cb->parsed_pattern;
3091
61.8k
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
3092
61.8k
uint32_t *this_parsed_item = NULL;
3093
61.8k
uint32_t *prev_parsed_item = NULL;
3094
61.8k
uint32_t meta_quantifier = 0;
3095
61.8k
uint32_t add_after_mark = 0;
3096
61.8k
uint16_t nest_depth = 0;
3097
61.8k
int16_t class_depth_m1 = -1; /* The m1 means minus 1. */
3098
61.8k
int16_t class_maxdepth_m1 = -1;
3099
61.8k
uint16_t hash;
3100
61.8k
int after_manual_callout = 0;
3101
61.8k
int expect_cond_assert = 0;
3102
61.8k
int errorcode = 0;
3103
61.8k
int escape;
3104
61.8k
int i;
3105
61.8k
BOOL inescq = FALSE;
3106
61.8k
BOOL inverbname = FALSE;
3107
61.8k
BOOL utf = (options & PCRE2_UTF) != 0;
3108
61.8k
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
3109
61.8k
BOOL is_dupname;
3110
61.8k
BOOL negate_class;
3111
61.8k
BOOL okquantifier = FALSE;
3112
61.8k
PCRE2_SPTR thisptr;
3113
61.8k
PCRE2_SPTR name;
3114
61.8k
PCRE2_SPTR ptrend = cb->end_pattern;
3115
61.8k
PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
3116
61.8k
PCRE2_SPTR class_range_forbid_ptr = NULL;
3117
61.8k
named_group *ng;
3118
61.8k
nest_save *top_nest, *end_nests;
3119
#ifdef PCRE2_DEBUG
3120
uint32_t *parsed_pattern_check;
3121
ptrdiff_t parsed_pattern_extra = 0;
3122
ptrdiff_t parsed_pattern_extra_check = 0;
3123
PCRE2_SPTR ptr_check;
3124
#endif
3125
3126
61.8k
PCRE2_ASSERT(parsed_pattern != NULL);
3127
3128
/* Insert leading items for word and line matching (features provided for the
3129
benefit of pcre2grep). */
3130
3131
61.8k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
3132
0
  {
3133
0
  *parsed_pattern++ = META_CIRCUMFLEX;
3134
0
  *parsed_pattern++ = META_NOCAPTURE;
3135
0
  }
3136
61.8k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
3137
0
  {
3138
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
3139
0
  *parsed_pattern++ = META_NOCAPTURE;
3140
0
  }
3141
3142
#ifdef PCRE2_DEBUG
3143
parsed_pattern_check = parsed_pattern;
3144
ptr_check = ptr;
3145
#endif
3146
3147
/* If the pattern is actually a literal string, process it separately to avoid
3148
cluttering up the main loop. */
3149
3150
61.8k
if ((options & PCRE2_LITERAL) != 0)
3151
0
  {
3152
0
  while (ptr < ptrend)
3153
0
    {
3154
0
    if (parsed_pattern >= parsed_pattern_end)
3155
0
      {
3156
0
      PCRE2_DEBUG_UNREACHABLE();
3157
0
      errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3158
0
      goto FAILED;
3159
0
      }
3160
0
    thisptr = ptr;
3161
0
    GETCHARINCTEST(c, ptr);
3162
0
    if (auto_callout)
3163
0
      parsed_pattern = manage_callouts(thisptr, &previous_callout,
3164
0
        auto_callout, parsed_pattern, cb);
3165
0
    PARSED_LITERAL(c, parsed_pattern);
3166
0
    }
3167
0
  goto PARSED_END;
3168
0
  }
3169
3170
/* Process a real regex which may contain meta-characters. */
3171
3172
61.8k
top_nest = NULL;
3173
61.8k
end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
3174
3175
/* The size of the nest_save structure might not be a factor of the size of the
3176
workspace. Therefore we must round down end_nests so as to correctly avoid
3177
creating a nest_save that spans the end of the workspace. */
3178
3179
61.8k
end_nests = (nest_save *)((char *)end_nests -
3180
61.8k
  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
3181
3182
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
3183
3184
61.8k
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
3185
3186
/* Now scan the pattern */
3187
3188
16.5M
while (ptr < ptrend)
3189
16.4M
  {
3190
16.4M
  int prev_expect_cond_assert;
3191
16.4M
  uint32_t min_repeat = 0, max_repeat = 0;
3192
16.4M
  uint32_t set, unset, *optset;
3193
16.4M
  uint32_t xset, xunset, *xoptset;
3194
16.4M
  uint32_t terminator;
3195
16.4M
  uint32_t prev_meta_quantifier;
3196
16.4M
  BOOL prev_okquantifier;
3197
16.4M
  PCRE2_SPTR tempptr;
3198
16.4M
  PCRE2_SIZE offset;
3199
3200
16.4M
  if (nest_depth > cb->cx->parens_nest_limit)
3201
3
    {
3202
3
    errorcode = ERR19;
3203
3
    goto FAILED;        /* Parentheses too deeply nested */
3204
3
    }
3205
3206
  /* Check that we haven't emitted too much into parsed_pattern. We allocate
3207
  a suitably-sized buffer upfront, then do unchecked writes to it. If we only
3208
  write a little bit too much, everything will appear to be OK, because the
3209
  upfront size is an overestimate... but a malicious pattern could end up
3210
  forcing a write past the buffer end. We must catch this during
3211
  development. */
3212
3213
#ifdef PCRE2_DEBUG
3214
  /* Strong post-write check. Won't help in release builds - at this point
3215
  the write has already occurred so it's too late. However, should stop us
3216
  committing unsafe code. */
3217
  PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
3218
               (parsed_pattern_extra - parsed_pattern_extra_check) <=
3219
                 max_parsed_pattern(ptr_check, ptr, utf, options));
3220
  parsed_pattern_check = parsed_pattern;
3221
  parsed_pattern_extra_check = parsed_pattern_extra;
3222
  ptr_check = ptr;
3223
#endif
3224
3225
16.4M
  if (parsed_pattern >= parsed_pattern_end)
3226
0
    {
3227
    /* Weak pre-write check; only ensures parsed_pattern[0] is writeable
3228
    (but the code below can write many chars). Better than nothing. */
3229
0
    PCRE2_DEBUG_UNREACHABLE();
3230
0
    errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3231
0
    goto FAILED;
3232
0
    }
3233
3234
  /* If the last time round this loop something was added, parsed_pattern will
3235
  no longer be equal to this_parsed_item. Remember where the previous item
3236
  started and reset for the next item. Note that sometimes round the loop,
3237
  nothing gets added (e.g. for ignored white space). */
3238
3239
16.4M
  if (this_parsed_item != parsed_pattern)
3240
16.1M
    {
3241
16.1M
    prev_parsed_item = this_parsed_item;
3242
16.1M
    this_parsed_item = parsed_pattern;
3243
16.1M
    }
3244
3245
  /* Get next input character, save its position for callout handling. */
3246
3247
16.4M
  thisptr = ptr;
3248
16.4M
  GETCHARINCTEST(c, ptr);
3249
3250
  /* Copy quoted literals until \E, allowing for the possibility of automatic
3251
  callouts, except when processing a (*VERB) "name".  */
3252
3253
16.4M
  if (inescq)
3254
210k
    {
3255
210k
    if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3256
948
      {
3257
948
      inescq = FALSE;
3258
948
      ptr++;   /* Skip E */
3259
948
      }
3260
209k
    else
3261
209k
      {
3262
209k
      if (expect_cond_assert > 0)   /* A literal is not allowed if we are */
3263
3
        {                           /* expecting a conditional assertion, */
3264
3
        ptr--;                      /* but an empty \Q\E sequence is OK.  */
3265
3
        errorcode = ERR28;
3266
3
        goto FAILED;
3267
3
        }
3268
209k
      if (inverbname)
3269
51.7k
        {                          /* Don't use PARSED_LITERAL() because it */
3270
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3271
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3272
#endif
3273
51.7k
        *parsed_pattern++ = c;
3274
51.7k
        }
3275
157k
      else
3276
157k
        {
3277
157k
        if (after_manual_callout-- <= 0)
3278
157k
          parsed_pattern = manage_callouts(thisptr, &previous_callout,
3279
157k
            auto_callout, parsed_pattern, cb);
3280
157k
        PARSED_LITERAL(c, parsed_pattern);
3281
157k
        }
3282
209k
      meta_quantifier = 0;
3283
209k
      }
3284
210k
    continue;  /* Next character */
3285
210k
    }
3286
3287
  /* If we are processing the "name" part of a (*VERB:NAME) item, all
3288
  characters up to the closing parenthesis are literals except when
3289
  PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
3290
  and \E and escaped characters are allowed (no character types such as \d). If
3291
  PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
3292
  this by not entering the special (*VERB:NAME) processing - they are then
3293
  picked up below. Note that c is a character, not a code unit, so we must not
3294
  use MAX_255 to test its size because MAX_255 tests code units and is assumed
3295
  TRUE in 8-bit mode. */
3296
3297
16.2M
  if (inverbname &&
3298
16.2M
       (
3299
        /* EITHER: not both options set */
3300
136k
        ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
3301
136k
                    (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
3302
136k
#ifdef SUPPORT_UNICODE
3303
        /* OR: character > 255 AND not Unicode Pattern White Space */
3304
136k
        (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
3305
136k
#endif
3306
        /* OR: not a # comment or isspace() white space */
3307
136k
        (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
3308
9.64k
#ifdef SUPPORT_UNICODE
3309
        /* and not CHAR_NEL when Unicode is supported */
3310
9.64k
          && c != CHAR_NEL
3311
9.64k
#endif
3312
9.64k
       )))
3313
135k
    {
3314
135k
    PCRE2_SIZE verbnamelength;
3315
3316
135k
    switch(c)
3317
135k
      {
3318
122k
      default:                     /* Don't use PARSED_LITERAL() because it */
3319
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3320
      if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3321
#endif
3322
122k
      *parsed_pattern++ = c;
3323
122k
      break;
3324
3325
8.78k
      case CHAR_RIGHT_PARENTHESIS:
3326
8.78k
      inverbname = FALSE;
3327
      /* This is the length in characters */
3328
8.78k
      verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
3329
      /* But the limit on the length is in code units */
3330
8.78k
      if (ptr - verbnamestart - 1 > (int)MAX_MARK)
3331
7
        {
3332
7
        ptr--;
3333
7
        errorcode = ERR76;
3334
7
        goto FAILED;
3335
7
        }
3336
8.78k
      *verblengthptr = (uint32_t)verbnamelength;
3337
3338
      /* If this name was on a verb such as (*ACCEPT) which does not continue,
3339
      a (*MARK) was generated for the name. We now add the original verb as the
3340
      next item. */
3341
3342
8.78k
      if (add_after_mark != 0)
3343
1.11k
        {
3344
1.11k
        *parsed_pattern++ = add_after_mark;
3345
1.11k
        add_after_mark = 0;
3346
1.11k
        }
3347
8.78k
      break;
3348
3349
4.59k
      case CHAR_BACKSLASH:
3350
4.59k
      if ((options & PCRE2_ALT_VERBNAMES) != 0)
3351
1.01k
        {
3352
1.01k
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3353
1.01k
          xoptions, cb->bracount, FALSE, cb);
3354
1.01k
        if (errorcode != 0) goto FAILED;
3355
1.01k
        }
3356
3.58k
      else escape = 0;   /* Treat all as literal */
3357
3358
4.59k
      switch(escape)
3359
4.59k
        {
3360
3.84k
        case 0:                    /* Don't use PARSED_LITERAL() because it */
3361
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3362
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3363
#endif
3364
3.84k
        *parsed_pattern++ = c;
3365
3.84k
        break;
3366
3367
0
        case ESC_ub:
3368
0
        *parsed_pattern++ = CHAR_u;
3369
0
        PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3370
0
        break;
3371
3372
372
        case ESC_Q:
3373
372
        inescq = TRUE;
3374
372
        break;
3375
3376
351
        case ESC_E:           /* Ignore */
3377
351
        break;
3378
3379
29
        default:
3380
29
        errorcode = ERR40;    /* Invalid in verb name */
3381
29
        goto FAILED;
3382
4.59k
        }
3383
135k
      }
3384
135k
    continue;   /* Next character in pattern */
3385
135k
    }
3386
3387
  /* Not a verb name character. At this point we must process everything that
3388
  must not change the quantification state. This is mainly comments, but we
3389
  handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3390
  A+, as in Perl. An isolated \E is ignored. */
3391
3392
16.1M
  if (c == CHAR_BACKSLASH && ptr < ptrend)
3393
975k
    {
3394
975k
    if (*ptr == CHAR_Q || *ptr == CHAR_E)
3395
2.79k
      {
3396
2.79k
      inescq = *ptr == CHAR_Q;
3397
2.79k
      ptr++;
3398
2.79k
      continue;
3399
2.79k
      }
3400
975k
    }
3401
3402
  /* Skip over whitespace and # comments in extended mode. Note that c is a
3403
  character, not a code unit, so we must not use MAX_255 to test its size
3404
  because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3405
  whitespace characters are those designated as "Pattern White Space" by
3406
  Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3407
  U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3408
  subset of space characters that match \h and \v. */
3409
3410
16.1M
  if ((options & PCRE2_EXTENDED) != 0)
3411
4.07M
    {
3412
4.07M
    if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3413
3.94M
#ifdef SUPPORT_UNICODE
3414
3.94M
    if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3415
3.93M
#endif
3416
3.93M
    if (c == CHAR_NUMBER_SIGN)
3417
2.47k
      {
3418
230k
      while (ptr < ptrend)
3419
229k
        {
3420
229k
        if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3421
1.68k
          {                       /* IS_NEWLINE sets cb->nllen. */
3422
1.68k
          ptr += cb->nllen;
3423
1.68k
          break;
3424
1.68k
          }
3425
228k
        ptr++;
3426
228k
#ifdef SUPPORT_UNICODE
3427
228k
        if (utf) FORWARDCHARTEST(ptr, ptrend);
3428
228k
#endif
3429
228k
        }
3430
2.47k
      continue;  /* Next character in pattern */
3431
2.47k
      }
3432
3.93M
    }
3433
3434
  /* Skip over bracketed comments */
3435
3436
15.9M
  if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3437
15.9M
      ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3438
358
    {
3439
3.53k
    while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3440
358
    if (ptr >= ptrend)
3441
23
      {
3442
23
      errorcode = ERR18;  /* A special error for missing ) in a comment */
3443
23
      goto FAILED;        /* to make it easier to debug. */
3444
23
      }
3445
335
    ptr++;
3446
335
    continue;  /* Next character in pattern */
3447
358
    }
3448
3449
  /* If the next item is not a quantifier, fill in length of any previous
3450
  callout and create an auto callout if required. */
3451
3452
15.9M
  if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3453
15.9M
       (c != CHAR_LEFT_CURLY_BRACKET ||
3454
14.9M
         (tempptr = ptr,
3455
369k
         !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3456
14.7M
    {
3457
14.7M
    if (after_manual_callout-- <= 0)
3458
14.6M
      {
3459
14.6M
      parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3460
14.6M
        parsed_pattern, cb);
3461
14.6M
      this_parsed_item = parsed_pattern;  /* New start for current item */
3462
14.6M
      }
3463
14.7M
    }
3464
3465
  /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3466
  assertion, possibly preceded by a callout. If the value is 1, we have just
3467
  had the callout and expect an assertion. There must be at least 3 more
3468
  characters in all cases. When expect_cond_assert is 2, we know that the
3469
  current character is an opening parenthesis, as otherwise we wouldn't be
3470
  here. However, when it is 1, we need to check, and it's easiest just to check
3471
  always. Note that expect_cond_assert may be negative, since all callouts just
3472
  decrement it. */
3473
3474
15.9M
  if (expect_cond_assert > 0)
3475
15.4k
    {
3476
15.4k
    BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3477
15.4k
              (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3478
15.4k
    if (ok)
3479
15.4k
      {
3480
15.4k
      if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3481
83
        {
3482
83
        ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3483
83
        }
3484
15.3k
      else switch(ptr[1])  /* Traditional symbolic format */
3485
15.3k
        {
3486
3.25k
        case CHAR_C:
3487
3.25k
        ok = expect_cond_assert == 2;
3488
3.25k
        break;
3489
3490
7.14k
        case CHAR_EQUALS_SIGN:
3491
8.71k
        case CHAR_EXCLAMATION_MARK:
3492
8.71k
        break;
3493
3494
3.35k
        case CHAR_LESS_THAN_SIGN:
3495
3.35k
        ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3496
3.35k
        break;
3497
3498
8
        default:
3499
8
        ok = FALSE;
3500
15.3k
        }
3501
15.4k
      }
3502
3503
15.4k
    if (!ok)
3504
34
      {
3505
34
      ptr--;   /* Adjust error offset */
3506
34
      errorcode = ERR28;
3507
34
      goto FAILED;
3508
34
      }
3509
15.4k
    }
3510
3511
  /* Remember whether we are expecting a conditional assertion, and set the
3512
  default for this item. */
3513
3514
15.9M
  prev_expect_cond_assert = expect_cond_assert;
3515
15.9M
  expect_cond_assert = 0;
3516
3517
  /* Remember quantification status for the previous significant item, then set
3518
  default for this item. */
3519
3520
15.9M
  prev_okquantifier = okquantifier;
3521
15.9M
  prev_meta_quantifier = meta_quantifier;
3522
15.9M
  okquantifier = FALSE;
3523
15.9M
  meta_quantifier = 0;
3524
3525
  /* If the previous significant item was a quantifier, adjust the parsed code
3526
  if there is a following modifier. The base meta value is always followed by
3527
  the PLUS and QUERY values, in that order. We do this here rather than after
3528
  reading a quantifier so that intervening comments and /x whitespace can be
3529
  ignored without having to replicate code. */
3530
3531
15.9M
  if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3532
154k
    {
3533
154k
    parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3534
154k
      prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3535
138k
        0x00020000u : 0x00010000u);
3536
154k
    continue;  /* Next character in pattern */
3537
154k
    }
3538
3539
  /* Process the next item in the main part of a pattern. */
3540
3541
15.8M
  switch(c)
3542
15.8M
    {
3543
10.8M
    default:              /* Non-special character */
3544
10.8M
    PARSED_LITERAL(c, parsed_pattern);
3545
10.8M
    break;
3546
3547
3548
    /* ---- Escape sequence ---- */
3549
3550
972k
    case CHAR_BACKSLASH:
3551
972k
    tempptr = ptr;
3552
972k
    escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3553
972k
      xoptions, cb->bracount, FALSE, cb);
3554
972k
    if (errorcode != 0)
3555
595
      {
3556
849
      ESCAPE_FAILED:
3557
849
      if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3558
849
        goto FAILED;
3559
0
      ptr = tempptr;
3560
0
      if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3561
0
        {
3562
0
        GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3563
0
        }
3564
0
      escape = 0;                 /* Treat as literal character */
3565
0
      }
3566
3567
    /* The escape was a data escape or literal character. */
3568
3569
971k
    if (escape == 0)
3570
305k
      {
3571
305k
      PARSED_LITERAL(c, parsed_pattern);
3572
305k
      }
3573
3574
    /* The escape was a back (or forward) reference. We keep the offset in
3575
    order to give a more useful diagnostic for a bad forward reference. For
3576
    references to groups numbered less than 10 we can't use more than two items
3577
    in parsed_pattern because they may be just two characters in the input (and
3578
    in a 64-bit world an offset may need two elements). So for them, the offset
3579
    of the first occurrence is held in a special vector. */
3580
3581
666k
    else if (escape < 0)
3582
30.4k
      {
3583
30.4k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 1);
3584
30.4k
      escape = -escape - 1;
3585
30.4k
      *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3586
30.4k
      if (escape < 10)
3587
18.9k
        {
3588
18.9k
        if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3589
6.79k
          cb->small_ref_offset[escape] = offset;
3590
18.9k
        }
3591
11.4k
      else
3592
11.4k
        {
3593
11.4k
        PUTOFFSET(offset, parsed_pattern);
3594
11.4k
        }
3595
30.4k
      okquantifier = TRUE;
3596
30.4k
      }
3597
3598
    /* The escape was a character class such as \d etc. or other special
3599
    escape indicator such as \A or \X. Most of them generate just a single
3600
    parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3601
    value. They are supported only when Unicode is available. The type and
3602
    value are packed into a single 32-bit value so that the whole sequence
3603
    uses only two elements in the parsed_vector. This is because the same
3604
    coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3605
    set.
3606
3607
    There are also some cases where the escape sequence is followed by a name:
3608
    \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3609
    and \g'name' are subroutine calls by name; \g{name} is a synonym for
3610
    \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3611
    and returned as a negative value (handled above). A name is coded as an
3612
    offset into the pattern and a length. */
3613
3614
636k
    else switch (escape)
3615
636k
      {
3616
5
      case ESC_C:
3617
5
#ifdef NEVER_BACKSLASH_C
3618
5
      errorcode = ERR85;
3619
5
      goto ESCAPE_FAILED;
3620
#else
3621
      if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3622
        {
3623
        errorcode = ERR83;
3624
        goto ESCAPE_FAILED;
3625
        }
3626
#endif
3627
0
      okquantifier = TRUE;
3628
0
      *parsed_pattern++ = META_ESCAPE + escape;
3629
0
      break;
3630
3631
      /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3632
      when \u{ is not followed by hex digits and }. It requests two literal
3633
      characters, u and { and we need this, as otherwise \u{ 12} (for example)
3634
      would be treated as u{12} now that spaces are allowed in quantifiers. */
3635
3636
0
      case ESC_ub:
3637
0
      *parsed_pattern++ = CHAR_u;
3638
0
      PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3639
0
      break;
3640
3641
10.7k
      case ESC_X:
3642
#ifndef SUPPORT_UNICODE
3643
      errorcode = ERR45;   /* Supported only with Unicode support */
3644
      goto ESCAPE_FAILED;
3645
#endif
3646
43.6k
      case ESC_H:
3647
81.4k
      case ESC_h:
3648
90.2k
      case ESC_N:
3649
123k
      case ESC_R:
3650
133k
      case ESC_V:
3651
145k
      case ESC_v:
3652
145k
      okquantifier = TRUE;
3653
145k
      *parsed_pattern++ = META_ESCAPE + escape;
3654
145k
      break;
3655
3656
56.7k
      default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3657
56.7k
      *parsed_pattern++ = META_ESCAPE + escape;
3658
56.7k
      break;
3659
3660
      /* Escapes that may change in UCP mode. */
3661
3662
34.6k
      case ESC_d:
3663
74.6k
      case ESC_D:
3664
123k
      case ESC_s:
3665
208k
      case ESC_S:
3666
374k
      case ESC_w:
3667
396k
      case ESC_W:
3668
396k
      okquantifier = TRUE;
3669
396k
      parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3670
396k
        xoptions);
3671
396k
      break;
3672
3673
      /* Unicode property matching */
3674
3675
15.7k
      case ESC_P:
3676
21.6k
      case ESC_p:
3677
21.6k
#ifdef SUPPORT_UNICODE
3678
21.6k
        {
3679
21.6k
        BOOL negated;
3680
21.6k
        uint16_t ptype = 0, pdata = 0;
3681
21.6k
        if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
3682
144
          goto ESCAPE_FAILED;
3683
21.5k
        if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3684
21.5k
        *parsed_pattern++ = META_ESCAPE + escape;
3685
21.5k
        *parsed_pattern++ = (ptype << 16) | pdata;
3686
21.5k
        okquantifier = TRUE;
3687
21.5k
        }
3688
#else
3689
      errorcode = ERR45;
3690
      goto ESCAPE_FAILED;
3691
#endif
3692
0
      break;  /* End \P and \p */
3693
3694
      /* When \g is used with quotes or angle brackets as delimiters, it is a
3695
      numerical or named subroutine call, and control comes here. When used
3696
      with brace delimiters it is a numerical back reference and does not come
3697
      here because check_escape() returns it directly as a reference. \k is
3698
      always a named back reference. */
3699
3700
252
      case ESC_g:
3701
16.1k
      case ESC_k:
3702
16.1k
      if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3703
16.1k
          *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3704
26
        {
3705
26
        errorcode = (escape == ESC_g)? ERR57 : ERR69;
3706
26
        goto ESCAPE_FAILED;
3707
26
        }
3708
16.1k
      terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3709
16.0k
        CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3710
15.9k
        CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3711
3712
      /* For a non-braced \g, check for a numerical recursion. */
3713
3714
16.1k
      if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3715
252
        {
3716
252
        PCRE2_SPTR p = ptr + 1;
3717
3718
252
        if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3719
252
            &errorcode))
3720
139
          {
3721
139
          if (p >= ptrend || *p != terminator)
3722
10
            {
3723
10
            errorcode = ERR57;
3724
10
            goto ESCAPE_FAILED;
3725
10
            }
3726
129
          ptr = p + 1;
3727
129
          goto SET_RECURSION;
3728
139
          }
3729
113
        if (errorcode != 0) goto ESCAPE_FAILED;
3730
113
        }
3731
3732
      /* Not a numerical recursion. Perl allows spaces and tabs after { and
3733
      before } but not for other delimiters. */
3734
3735
15.9k
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3736
15.9k
          &errorcode, cb)) goto ESCAPE_FAILED;
3737
3738
      /* \k and \g when used with braces are back references, whereas \g used
3739
      with quotes or angle brackets is a recursion */
3740
3741
15.9k
      *parsed_pattern++ =
3742
15.9k
        (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3743
15.8k
          META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3744
15.9k
      *parsed_pattern++ = namelen;
3745
3746
15.9k
      PUTOFFSET(offset, parsed_pattern);
3747
15.9k
      okquantifier = TRUE;
3748
15.9k
      break;  /* End special escape processing */
3749
636k
      }
3750
971k
    break;    /* End escape sequence processing */
3751
3752
3753
    /* ---- Single-character special items ---- */
3754
3755
971k
    case CHAR_CIRCUMFLEX_ACCENT:
3756
192k
    *parsed_pattern++ = META_CIRCUMFLEX;
3757
192k
    break;
3758
3759
231k
    case CHAR_DOLLAR_SIGN:
3760
231k
    *parsed_pattern++ = META_DOLLAR;
3761
231k
    break;
3762
3763
156k
    case CHAR_DOT:
3764
156k
    *parsed_pattern++ = META_DOT;
3765
156k
    okquantifier = TRUE;
3766
156k
    break;
3767
3768
3769
    /* ---- Single-character quantifiers ---- */
3770
3771
301k
    case CHAR_ASTERISK:
3772
301k
    meta_quantifier = META_ASTERISK;
3773
301k
    goto CHECK_QUANTIFIER;
3774
3775
340k
    case CHAR_PLUS:
3776
340k
    meta_quantifier = META_PLUS;
3777
340k
    goto CHECK_QUANTIFIER;
3778
3779
258k
    case CHAR_QUESTION_MARK:
3780
258k
    meta_quantifier = META_QUERY;
3781
258k
    goto CHECK_QUANTIFIER;
3782
3783
3784
    /* ---- Potential {n,m} quantifier ---- */
3785
3786
369k
    case CHAR_LEFT_CURLY_BRACKET:
3787
369k
    if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3788
369k
        &errorcode))
3789
163k
      {
3790
163k
      if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3791
163k
      PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3792
163k
      break;                               /* No more quantifier processing */
3793
163k
      }
3794
205k
    meta_quantifier = META_MINMAX;
3795
    /* Fall through */
3796
3797
3798
    /* ---- Quantifier post-processing ---- */
3799
3800
    /* Check that a quantifier is allowed after the previous item. This
3801
    guarantees that there is a previous item. */
3802
3803
1.10M
    CHECK_QUANTIFIER:
3804
1.10M
    if (!prev_okquantifier)
3805
197
      {
3806
197
      errorcode = ERR9;
3807
197
      goto FAILED_BACK;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
3808
197
      }
3809
3810
    /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3811
    quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3812
    sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3813
    wrapping it in non-capturing brackets, but we have to allow for a preceding
3814
    (*MARK) for when (*ACCEPT) has an argument. */
3815
3816
1.10M
    if (*prev_parsed_item == META_ACCEPT)
3817
1.26k
      {
3818
1.26k
      uint32_t *p;
3819
37.4k
      for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3820
1.26k
      *verbstartptr = META_NOCAPTURE;
3821
1.26k
      parsed_pattern[1] = META_KET;
3822
1.26k
      parsed_pattern += 2;
3823
3824
#ifdef PCRE2_DEBUG
3825
      PCRE2_ASSERT(parsed_pattern_extra >= 2);
3826
      parsed_pattern_extra -= 2;
3827
#endif
3828
1.26k
      }
3829
3830
    /* Now we can put the quantifier into the parsed pattern vector. At this
3831
    stage, we have only the basic quantifier. The check for a following + or ?
3832
    modifier happens at the top of the loop, after any intervening comments
3833
    have been removed. */
3834
3835
1.10M
    *parsed_pattern++ = meta_quantifier;
3836
1.10M
    if (c == CHAR_LEFT_CURLY_BRACKET)
3837
205k
      {
3838
205k
      *parsed_pattern++ = min_repeat;
3839
205k
      *parsed_pattern++ = max_repeat;
3840
205k
      }
3841
1.10M
    break;
3842
3843
3844
    /* ---- Character class ---- */
3845
3846
491k
    case CHAR_LEFT_SQUARE_BRACKET:
3847
3848
    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3849
    used for "start of word" and "end of word". As these are otherwise illegal
3850
    sequences, we don't break anything by recognizing them. They are replaced
3851
    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3852
    erroneous and are handled by the normal code below. */
3853
3854
491k
    if (ptrend - ptr >= 6 &&
3855
491k
         (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3856
490k
          PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3857
949
      {
3858
949
      *parsed_pattern++ = META_ESCAPE + ESC_b;
3859
3860
949
      if (ptr[2] == CHAR_LESS_THAN_SIGN)
3861
344
        {
3862
344
        *parsed_pattern++ = META_LOOKAHEAD;
3863
344
        }
3864
605
      else
3865
605
        {
3866
605
        *parsed_pattern++ = META_LOOKBEHIND;
3867
605
        *has_lookbehind = TRUE;
3868
3869
        /* The offset is used only for the "non-fixed length" error; this won't
3870
        occur here, so just store zero. */
3871
3872
605
        PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3873
605
        }
3874
3875
949
      if ((options & PCRE2_UCP) == 0)
3876
485
        *parsed_pattern++ = META_ESCAPE + ESC_w;
3877
464
      else
3878
464
        {
3879
464
        *parsed_pattern++ = META_ESCAPE + ESC_p;
3880
464
        *parsed_pattern++ = PT_WORD << 16;
3881
464
        }
3882
949
      *parsed_pattern++ = META_KET;
3883
949
      ptr += 6;
3884
949
      okquantifier = TRUE;
3885
949
      break;
3886
949
      }
3887
3888
    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3889
    they are encountered at the top level, so we'll do that too. */
3890
3891
490k
    if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3892
490k
         *ptr == CHAR_EQUALS_SIGN) &&
3893
490k
        check_posix_syntax(ptr, ptrend, &tempptr))
3894
295
      {
3895
295
      errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3896
295
      goto FAILED;
3897
295
      }
3898
3899
490k
    class_mode_state = ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)?
3900
427k
        CLASS_MODE_ALT_EXT : CLASS_MODE_NORMAL;
3901
3902
    /* Jump here from '(?[...])'. That jump must initialize class_mode_state,
3903
    set c to the '[' character, and ptr to just after the '['. */
3904
3905
490k
    FROM_PERL_EXTENDED_CLASS:
3906
490k
    okquantifier = TRUE;
3907
3908
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3909
    because there are holes in the encoding, and simply using the range A-Z
3910
    (for example) would include the characters in the holes. This applies only
3911
    to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3912
    in this respect. In order to accommodate this, we keep track of whether
3913
    character values are literal or not, and a state variable for handling
3914
    ranges. */
3915
3916
    /* Loop for the contents of the class. Classes may be nested, if
3917
    PCRE2_ALT_EXTENDED_CLASS is set, or the class is of the form (?[...]). */
3918
3919
    /* c is still set to '[' so the loop will handle the start of the class. */
3920
3921
490k
    class_depth_m1 = -1;
3922
490k
    class_maxdepth_m1 = -1;
3923
490k
    class_range_state = RANGE_NO;
3924
490k
    class_op_state = CLASS_OP_EMPTY;
3925
490k
    class_start = NULL;
3926
3927
490k
    for (;;)
3928
7.89M
      {
3929
7.89M
      BOOL char_is_literal = TRUE;
3930
3931
      /* Inside \Q...\E everything is literal except \E */
3932
3933
7.89M
      if (inescq)
3934
1.16k
        {
3935
1.16k
        if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3936
107
          {
3937
107
          inescq = FALSE;                   /* Reset literal state */
3938
107
          ptr++;                            /* Skip the 'E' */
3939
107
          goto CLASS_CONTINUE;
3940
107
          }
3941
3942
        /* Surprisingly, you cannot use \Q..\E to escape a character inside a
3943
        Perl extended class. However, empty \Q\E sequences are allowed, so here
3944
        we're only giving an error if the \Q..\E is non-empty. */
3945
3946
1.05k
        if (class_mode_state == CLASS_MODE_PERL_EXT)
3947
3
          {
3948
3
          errorcode = ERR116;
3949
3
          goto FAILED;
3950
3
          }
3951
3952
1.05k
        goto CLASS_LITERAL;
3953
1.05k
        }
3954
3955
      /* Skip over space and tab (only) in extended-more mode, or anywhere
3956
      inside a Perl extended class (which implies /xx). */
3957
3958
7.89M
      if ((c == CHAR_SPACE || c == CHAR_HT) &&
3959
7.89M
          ((options & PCRE2_EXTENDED_MORE) != 0 ||
3960
87.5k
           class_mode_state >= CLASS_MODE_PERL_EXT))
3961
17.5k
        goto CLASS_CONTINUE;
3962
3963
      /* Handle POSIX class names. Perl allows a negation extension of the
3964
      form [:^name:]. A square bracket that doesn't match the syntax is
3965
      treated as a literal. We also recognize the POSIX constructions
3966
      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3967
      5.6 and 5.8 do. */
3968
3969
7.87M
      if (class_depth_m1 >= 0 &&
3970
7.87M
          c == CHAR_LEFT_SQUARE_BRACKET &&
3971
7.87M
          ptrend - ptr >= 3 &&
3972
7.87M
          (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3973
405k
           *ptr == CHAR_EQUALS_SIGN) &&
3974
7.87M
          check_posix_syntax(ptr, ptrend, &tempptr))
3975
32.0k
        {
3976
32.0k
        BOOL posix_negate = FALSE;
3977
32.0k
        int posix_class;
3978
3979
        /* Perl treats a hyphen before a POSIX class as a literal, not the
3980
        start of a range. However, it gives a warning in its warning mode. PCRE
3981
        does not have a warning mode, so we give an error, because this is
3982
        likely an error on the user's part. */
3983
3984
32.0k
        if (class_range_state == RANGE_STARTED)
3985
4
          {
3986
4
          ptr = tempptr + 2;
3987
4
          errorcode = ERR50;
3988
4
          goto FAILED;
3989
4
          }
3990
3991
        /* Perl treats a hyphen after a POSIX class as a literal, not the
3992
        start of a range. However, it gives a warning in its warning mode
3993
        unless the hyphen is the last character in the class. PCRE does not
3994
        have a warning mode, so we give an error, because this is likely an
3995
        error on the user's part.
3996
3997
        Roll back to the hyphen for the error position. */
3998
3999
32.0k
        if (class_range_state == RANGE_FORBID_STARTED)
4000
5
          {
4001
5
          ptr = class_range_forbid_ptr;
4002
5
          errorcode = ERR50;
4003
5
          goto FAILED;
4004
5
          }
4005
4006
        /* Disallow implicit union in Perl extended classes. */
4007
4008
32.0k
        if (class_op_state == CLASS_OP_OPERAND &&
4009
32.0k
            class_mode_state == CLASS_MODE_PERL_EXT)
4010
3
          {
4011
3
          ptr = tempptr + 2;
4012
3
          errorcode = ERR113;
4013
3
          goto FAILED;
4014
3
          }
4015
4016
32.0k
        if (*ptr != CHAR_COLON)
4017
3
          {
4018
3
          ptr = tempptr + 2;
4019
3
          errorcode = ERR13;
4020
3
          goto FAILED;
4021
3
          }
4022
4023
32.0k
        if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
4024
752
          {
4025
752
          posix_negate = TRUE;
4026
752
          ptr++;
4027
752
          }
4028
4029
32.0k
        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4030
32.0k
        ptr = tempptr + 2;
4031
32.0k
        if (posix_class < 0)
4032
31
          {
4033
31
          errorcode = ERR30;
4034
31
          goto FAILED;
4035
31
          }
4036
4037
        /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
4038
        case, the hyphen is treated as a literal, but for '-1' it is disallowed
4039
        (because it would be interpreted as range). */
4040
4041
31.9k
        class_range_state = RANGE_FORBID_NO;
4042
31.9k
        class_op_state = CLASS_OP_OPERAND;
4043
4044
        /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
4045
        of the POSIX classes are converted to use Unicode properties \p or \P
4046
        or, in one case, \h or \H. The substitutes table has two values per
4047
        class, containing the type and value of a \p or \P item. The special
4048
        cases are specified with a negative type: a non-zero value causes \h or
4049
        \H to be used, and a zero value falls through to behave like a non-UCP
4050
        POSIX class. There are now also some extra options that force ASCII for
4051
        some classes. */
4052
4053
31.9k
#ifdef SUPPORT_UNICODE
4054
31.9k
        if ((options & PCRE2_UCP) != 0 &&
4055
31.9k
            (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
4056
31.9k
            !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
4057
13.8k
              (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
4058
13.6k
          {
4059
13.6k
          int ptype = posix_substitutes[2*posix_class];
4060
13.6k
          int pvalue = posix_substitutes[2*posix_class + 1];
4061
4062
13.6k
          if (ptype >= 0)
4063
12.5k
            {
4064
12.5k
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
4065
12.5k
            *parsed_pattern++ = (ptype << 16) | pvalue;
4066
12.5k
            goto CLASS_CONTINUE;
4067
12.5k
            }
4068
4069
1.09k
          if (pvalue != 0)
4070
341
            {
4071
341
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
4072
341
            goto CLASS_CONTINUE;
4073
341
            }
4074
4075
          /* Fall through */
4076
1.09k
          }
4077
19.1k
#endif  /* SUPPORT_UNICODE */
4078
4079
        /* Non-UCP POSIX class */
4080
4081
19.1k
        *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
4082
19.1k
        *parsed_pattern++ = posix_class;
4083
19.1k
        }
4084
4085
      /* Check for the start of the outermost class, or the start of a nested class. */
4086
4087
7.84M
      else if ((c == CHAR_LEFT_SQUARE_BRACKET &&
4088
7.84M
                (class_depth_m1 < 0 || class_mode_state == CLASS_MODE_ALT_EXT ||
4089
863k
                 class_mode_state == CLASS_MODE_PERL_EXT)) ||
4090
7.84M
               (c == CHAR_LEFT_PARENTHESIS &&
4091
7.27M
                class_mode_state == CLASS_MODE_PERL_EXT))
4092
564k
        {
4093
564k
        uint32_t start_c = c;
4094
564k
        uint32_t new_class_mode_state;
4095
4096
        /* Update the class mode, if moving into a 'leaf' inside a Perl extended
4097
        class. */
4098
4099
564k
        if (start_c == CHAR_LEFT_SQUARE_BRACKET &&
4100
564k
            class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 >= 0)
4101
700
          new_class_mode_state = CLASS_MODE_PERL_EXT_LEAF;
4102
563k
        else
4103
563k
          new_class_mode_state = class_mode_state;
4104
4105
        /* Tidy up the other class before starting the nested class. */
4106
        /* -[ beginning a nested class is a literal '-' */
4107
4108
564k
        if (class_range_state == RANGE_STARTED)
4109
124
          parsed_pattern[-1] = CHAR_MINUS;
4110
4111
        /* Disallow implicit union in Perl extended classes. */
4112
4113
564k
        if (class_op_state == CLASS_OP_OPERAND &&
4114
564k
            class_mode_state == CLASS_MODE_PERL_EXT)
4115
5
          {
4116
5
          errorcode = ERR113;
4117
5
          goto FAILED;
4118
5
          }
4119
4120
        /* Validate nesting depth */
4121
564k
        if (class_depth_m1 >= ECLASS_NEST_LIMIT - 1)
4122
18
          {
4123
18
          errorcode = ERR107;
4124
18
          goto FAILED;        /* Classes too deeply nested */
4125
18
          }
4126
4127
        /* Process the character class start. If the first character is '^', set
4128
        the negation flag. If the first few characters (either before or after ^)
4129
        are \Q\E or \E or space or tab in extended-more mode, we skip them too.
4130
        This makes for compatibility with Perl. */
4131
4132
564k
        negate_class = FALSE;
4133
564k
        for (;;)
4134
798k
          {
4135
798k
          if (ptr >= ptrend)
4136
36
            {
4137
36
            if (start_c == CHAR_LEFT_PARENTHESIS)
4138
3
              errorcode = ERR14;  /* Missing terminating ')' */
4139
33
            else
4140
33
              errorcode = ERR6;   /* Missing terminating ']' */
4141
36
            goto FAILED;
4142
36
            }
4143
4144
798k
          GETCHARINCTEST(c, ptr);
4145
798k
          if (new_class_mode_state == CLASS_MODE_PERL_EXT) break;
4146
798k
          else if (c == CHAR_BACKSLASH)
4147
28.1k
            {
4148
28.1k
            if (ptr < ptrend && *ptr == CHAR_E) ptr++;
4149
27.6k
            else if (ptrend - ptr >= 3 &&
4150
27.6k
                PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4151
74
              ptr += 3;
4152
27.5k
            else
4153
27.5k
              break;
4154
28.1k
            }
4155
770k
          else if ((c == CHAR_SPACE || c == CHAR_HT) &&  /* Note: just these two */
4156
770k
                   ((options & PCRE2_EXTENDED_MORE) != 0 ||
4157
2.36k
                    new_class_mode_state >= CLASS_MODE_PERL_EXT))
4158
1.21k
            continue;
4159
769k
          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4160
232k
            negate_class = TRUE;
4161
536k
          else break;
4162
798k
          }
4163
4164
        /* Now the real contents of the class; c has the first "real" character.
4165
        Empty classes are permitted only if the option is set, and if it's not
4166
        a Perl-extended class. */
4167
4168
564k
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4169
564k
            (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0 &&
4170
564k
            new_class_mode_state < CLASS_MODE_PERL_EXT)
4171
21.7k
          {
4172
21.7k
          PCRE2_ASSERT(start_c == CHAR_LEFT_SQUARE_BRACKET);
4173
4174
21.7k
          if (class_start != NULL)
4175
10.3k
            {
4176
10.3k
            PCRE2_ASSERT(class_depth_m1 >= 0);
4177
            /* Represents that the class is an extended class. */
4178
10.3k
            *class_start |= CLASS_IS_ECLASS;
4179
10.3k
            class_start = NULL;
4180
10.3k
            }
4181
4182
21.7k
          *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
4183
4184
          /* Leave nesting depth unchanged; but check for zero depth to handle the
4185
          very first (top-level) class being empty. */
4186
21.7k
          if (class_depth_m1 < 0) break;
4187
4188
11.5k
          class_range_state = RANGE_NO; /* for processing the containing class */
4189
11.5k
          class_op_state = CLASS_OP_OPERAND;
4190
11.5k
          goto CLASS_CONTINUE;
4191
21.7k
          }
4192
4193
        /* Enter a non-empty class. */
4194
4195
542k
        if (class_start != NULL)
4196
36.3k
          {
4197
36.3k
          PCRE2_ASSERT(class_depth_m1 >= 0);
4198
          /* Represents that the class is an extended class. */
4199
36.3k
          *class_start |= CLASS_IS_ECLASS;
4200
36.3k
          class_start = NULL;
4201
36.3k
          }
4202
4203
542k
        class_start = parsed_pattern;
4204
542k
        *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
4205
542k
        class_range_state = RANGE_NO;
4206
542k
        class_op_state = CLASS_OP_EMPTY;
4207
542k
        class_mode_state = new_class_mode_state;
4208
542k
        ++class_depth_m1;
4209
542k
        if (class_maxdepth_m1 < class_depth_m1)
4210
505k
          class_maxdepth_m1 = class_depth_m1;
4211
        /* Reset; no op seen yet at new depth. */
4212
542k
        cb->class_op_used[class_depth_m1] = 0;
4213
4214
        /* Implement the special start-of-class literal meaning of ']'. */
4215
542k
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4216
542k
            new_class_mode_state != CLASS_MODE_PERL_EXT)
4217
23.9k
          {
4218
23.9k
          class_range_state = RANGE_OK_LITERAL;
4219
23.9k
          class_op_state = CLASS_OP_OPERAND;
4220
23.9k
          PARSED_LITERAL(c, parsed_pattern);
4221
23.9k
          goto CLASS_CONTINUE;
4222
23.9k
          }
4223
4224
518k
        continue;  /* We have already loaded c with the next character */
4225
542k
        }
4226
4227
      /* Check for the end of the class. */
4228
4229
7.27M
      else if (c == CHAR_RIGHT_SQUARE_BRACKET ||
4230
7.27M
               (c == CHAR_RIGHT_PARENTHESIS && class_mode_state == CLASS_MODE_PERL_EXT))
4231
540k
        {
4232
        /* In Perl extended mode, the ']' can only be used to match the
4233
        opening '[', and ')' must match an opening parenthesis. */
4234
540k
        if (class_mode_state == CLASS_MODE_PERL_EXT)
4235
133
          {
4236
133
          if (c == CHAR_RIGHT_SQUARE_BRACKET && class_depth_m1 != 0)
4237
3
            {
4238
3
            errorcode = ERR14;
4239
3
            goto FAILED_BACK;
4240
3
            }
4241
130
          if (c == CHAR_RIGHT_PARENTHESIS && class_depth_m1 < 1)
4242
8
            {
4243
8
            errorcode = ERR22;
4244
8
            goto FAILED;
4245
8
            }
4246
130
          }
4247
4248
        /* Check no trailing operator. */
4249
540k
        if (class_op_state == CLASS_OP_OPERATOR)
4250
4
          {
4251
4
          errorcode = ERR110;
4252
4
          goto FAILED;
4253
4
          }
4254
4255
        /* Check no empty expression for Perl extended expressions. */
4256
540k
        if (class_mode_state == CLASS_MODE_PERL_EXT &&
4257
540k
            class_op_state == CLASS_OP_EMPTY)
4258
10
          {
4259
10
          errorcode = ERR114;
4260
10
          goto FAILED;
4261
10
          }
4262
4263
        /* -] at the end of a class is a literal '-' */
4264
540k
        if (class_range_state == RANGE_STARTED)
4265
24.4k
          parsed_pattern[-1] = CHAR_MINUS;
4266
4267
540k
        *parsed_pattern++ = META_CLASS_END;
4268
4269
540k
        if (--class_depth_m1 < 0)
4270
478k
          {
4271
          /* Check for and consume ')' after '(?[...]'. */
4272
478k
          PCRE2_ASSERT(class_mode_state != CLASS_MODE_PERL_EXT_LEAF);
4273
478k
          if (class_mode_state == CLASS_MODE_PERL_EXT)
4274
73
            {
4275
73
            if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4276
6
              {
4277
6
              errorcode = ERR115;
4278
6
              goto FAILED;
4279
6
              }
4280
4281
67
            ptr++;
4282
67
            }
4283
4284
478k
          break;
4285
478k
          }
4286
4287
61.3k
        class_range_state = RANGE_NO; /* for processing the containing class */
4288
61.3k
        class_op_state = CLASS_OP_OPERAND;
4289
61.3k
        if (class_mode_state == CLASS_MODE_PERL_EXT_LEAF)
4290
658
          class_mode_state = CLASS_MODE_PERL_EXT;
4291
        /* The extended class flag has already
4292
        been set for the parent class. */
4293
61.3k
        class_start = NULL;
4294
61.3k
        }
4295
4296
      /* Handle a Perl set binary operator */
4297
4298
6.73M
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4299
6.73M
               (c == CHAR_PLUS || c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4300
1.34k
                c == CHAR_AMPERSAND || c == CHAR_CIRCUMFLEX_ACCENT))
4301
768
        {
4302
        /* Check that there was a preceding operand. */
4303
768
        if (class_op_state != CLASS_OP_OPERAND)
4304
15
          {
4305
15
          errorcode = ERR109;
4306
15
          goto FAILED;
4307
15
          }
4308
4309
753
        if (class_start != NULL)
4310
30
          {
4311
30
          PCRE2_ASSERT(class_depth_m1 >= 0);
4312
          /* Represents that the class is an extended class. */
4313
30
          *class_start |= CLASS_IS_ECLASS;
4314
30
          class_start = NULL;
4315
30
          }
4316
4317
753
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4318
753
                     class_range_state != RANGE_FORBID_STARTED);
4319
4320
753
        *parsed_pattern++ = c == CHAR_PLUS? META_ECLASS_OR :
4321
753
                            c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4322
665
                            c == CHAR_MINUS? META_ECLASS_SUB :
4323
587
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4324
53
                            META_ECLASS_XOR;
4325
753
        class_range_state = RANGE_NO;
4326
753
        class_op_state = CLASS_OP_OPERATOR;
4327
753
        }
4328
4329
      /* Handle a Perl set unary operator */
4330
4331
6.73M
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4332
6.73M
               c == CHAR_EXCLAMATION_MARK)
4333
315
        {
4334
        /* Check that the "!" has not got a preceding operand (i.e. it's the
4335
        start of the class, or follows an operator). */
4336
315
        if (class_op_state == CLASS_OP_OPERAND)
4337
5
          {
4338
5
          errorcode = ERR113;
4339
5
          goto FAILED;
4340
5
          }
4341
4342
310
        if (class_start != NULL)
4343
33
          {
4344
33
          PCRE2_ASSERT(class_depth_m1 >= 0);
4345
          /* Represents that the class is an extended class. */
4346
33
          *class_start |= CLASS_IS_ECLASS;
4347
33
          class_start = NULL;
4348
33
          }
4349
4350
310
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4351
310
                     class_range_state != RANGE_FORBID_STARTED);
4352
4353
310
        *parsed_pattern++ = META_ECLASS_NOT;
4354
310
        class_range_state = RANGE_NO;
4355
310
        class_op_state = CLASS_OP_OPERATOR;
4356
310
        }
4357
4358
      /* Handle a UTS#18 set operator */
4359
4360
6.73M
      else if (class_mode_state == CLASS_MODE_ALT_EXT &&
4361
6.73M
               (c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4362
1.88M
                c == CHAR_AMPERSAND || c == CHAR_TILDE) &&
4363
6.73M
               ptr < ptrend && *ptr == c)
4364
11.4k
        {
4365
11.4k
        ++ptr;
4366
4367
        /* Check there isn't a triple-repetition. */
4368
11.4k
        if (ptr < ptrend && *ptr == c)
4369
21
          {
4370
32.4k
          while (ptr < ptrend && *ptr == c) ++ptr;  /* Improve error offset. */
4371
21
          errorcode = ERR108;
4372
21
          goto FAILED;
4373
21
          }
4374
4375
        /* Check for a preceding operand. */
4376
11.4k
        if (class_op_state != CLASS_OP_OPERAND)
4377
4
          {
4378
4
          errorcode = ERR109;
4379
4
          goto FAILED;
4380
4
          }
4381
4382
        /* Check for mixed precedence. Forbid [A--B&&C]. */
4383
11.4k
        if (cb->class_op_used[class_depth_m1] != 0 &&
4384
11.4k
            cb->class_op_used[class_depth_m1] != (uint8_t)c)
4385
1
          {
4386
1
          errorcode = ERR111;
4387
1
          goto FAILED;
4388
1
          }
4389
4390
11.4k
        if (class_start != NULL)
4391
2.35k
          {
4392
2.35k
          PCRE2_ASSERT(class_depth_m1 >= 0);
4393
          /* Represents that the class is an extended class. */
4394
2.35k
          *class_start |= CLASS_IS_ECLASS;
4395
2.35k
          class_start = NULL;
4396
2.35k
          }
4397
4398
        /* Dangling '-' before an operator is a literal */
4399
11.4k
        if (class_range_state == RANGE_STARTED)
4400
40
          parsed_pattern[-1] = CHAR_MINUS;
4401
4402
11.4k
        *parsed_pattern++ = c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4403
11.4k
                            c == CHAR_MINUS? META_ECLASS_SUB :
4404
10.4k
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4405
10.3k
                            META_ECLASS_XOR;
4406
11.4k
        class_range_state = RANGE_NO;
4407
11.4k
        class_op_state = CLASS_OP_OPERATOR;
4408
11.4k
        cb->class_op_used[class_depth_m1] = (uint8_t)c;
4409
11.4k
        }
4410
4411
      /* Handle escapes in a class */
4412
4413
6.72M
      else if (c == CHAR_BACKSLASH)
4414
294k
        {
4415
294k
        tempptr = ptr;
4416
294k
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
4417
294k
          xoptions, cb->bracount, TRUE, cb);
4418
4419
294k
        if (errorcode != 0)
4420
61
          {
4421
61
          if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0 ||
4422
61
              class_mode_state >= CLASS_MODE_PERL_EXT)
4423
61
            goto FAILED;
4424
0
          ptr = tempptr;
4425
0
          if (ptr >= ptrend) c = CHAR_BACKSLASH; else
4426
0
            {
4427
0
            GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
4428
0
            }
4429
0
          escape = 0;                 /* Treat as literal character */
4430
0
          }
4431
4432
294k
        switch(escape)
4433
294k
          {
4434
108k
          case 0:  /* Escaped character code point is in c */
4435
108k
          char_is_literal = FALSE;
4436
108k
          goto CLASS_LITERAL;      /* (a few lines above) */
4437
4438
2.61k
          case ESC_b:
4439
2.61k
          c = CHAR_BS;    /* \b is backspace in a class */
4440
2.61k
          char_is_literal = FALSE;
4441
2.61k
          goto CLASS_LITERAL;
4442
4443
277
          case ESC_k:
4444
277
          c = CHAR_k;     /* \k is not special in a class, just like \g */
4445
277
          char_is_literal = FALSE;
4446
277
          goto CLASS_LITERAL;
4447
4448
136
          case ESC_Q:
4449
136
          inescq = TRUE;  /* Enter literal mode */
4450
136
          goto CLASS_CONTINUE;
4451
4452
892
          case ESC_E:     /* Ignore orphan \E */
4453
892
          goto CLASS_CONTINUE;
4454
4455
38
          case ESC_B:     /* Always an error in a class */
4456
70
          case ESC_R:
4457
88
          case ESC_X:
4458
88
          errorcode = ERR7;
4459
88
          ptr--;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
4460
88
          goto FAILED;
4461
4462
12
          case ESC_N:     /* Not permitted by Perl either */
4463
12
          errorcode = ERR71;
4464
12
          goto FAILED;
4465
4466
10.1k
          case ESC_H:
4467
30.2k
          case ESC_h:
4468
34.8k
          case ESC_V:
4469
37.8k
          case ESC_v:
4470
37.8k
          *parsed_pattern++ = META_ESCAPE + escape;
4471
37.8k
          break;
4472
4473
          /* These escapes may be converted to Unicode property tests when
4474
          PCRE2_UCP is set. */
4475
4476
13.3k
          case ESC_d:
4477
26.3k
          case ESC_D:
4478
79.0k
          case ESC_s:
4479
95.0k
          case ESC_S:
4480
113k
          case ESC_w:
4481
123k
          case ESC_W:
4482
123k
          parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
4483
123k
            xoptions);
4484
123k
          break;
4485
4486
          /* Explicit Unicode property matching */
4487
4488
5.63k
          case ESC_P:
4489
20.5k
          case ESC_p:
4490
20.5k
#ifdef SUPPORT_UNICODE
4491
20.5k
            {
4492
20.5k
            BOOL negated;
4493
20.5k
            uint16_t ptype = 0, pdata = 0;
4494
20.5k
            if (!get_ucp(&ptr, &negated, &ptype, &pdata, &errorcode, cb))
4495
36
              goto FAILED;
4496
4497
            /* In caseless matching, particular characteristics Lu, Ll, and Lt
4498
            get converted to the general characteristic L&. That is, upper,
4499
            lower, and title case letters are all conflated. */
4500
4501
20.5k
            if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
4502
20.5k
                (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
4503
193
              {
4504
193
              ptype = PT_LAMP;
4505
193
              pdata = 0;
4506
193
              }
4507
4508
20.5k
            if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
4509
20.5k
            *parsed_pattern++ = META_ESCAPE + escape;
4510
20.5k
            *parsed_pattern++ = (ptype << 16) | pdata;
4511
20.5k
            }
4512
#else
4513
          errorcode = ERR45;
4514
          goto FAILED;
4515
#endif
4516
0
          break;  /* End \P and \p */
4517
4518
          /* All others are not allowed in a class */
4519
4520
0
          default:
4521
0
          PCRE2_DEBUG_UNREACHABLE();
4522
          /* Fall through */
4523
4524
20
          case ESC_A:
4525
47
          case ESC_Z:
4526
65
          case ESC_z:
4527
66
          case ESC_G:
4528
70
          case ESC_K:
4529
70
          case ESC_C:
4530
70
          errorcode = ERR7;
4531
70
          ptr--;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
4532
70
          goto FAILED;
4533
294k
          }
4534
4535
        /* All the switch-cases above which end in "break" describe a set
4536
        of characters. None may start a range. */
4537
4538
        /* The second part of a range can be a single-character escape
4539
        sequence (detected above), but not any of the other escapes. Perl
4540
        treats a hyphen as a literal in such circumstances. However, in Perl's
4541
        warning mode, a warning is given, so PCRE now faults it, as it is
4542
        almost certainly a mistake on the user's part. */
4543
4544
181k
        if (class_range_state == RANGE_STARTED)
4545
9
          {
4546
9
          errorcode = ERR50;
4547
9
          goto FAILED;
4548
9
          }
4549
4550
        /* Perl gives a warning unless the hyphen following a multi-character
4551
        escape is the last character in the class. PCRE throws an error. */
4552
4553
181k
        if (class_range_state == RANGE_FORBID_STARTED)
4554
3
          {
4555
3
          ptr = class_range_forbid_ptr;
4556
3
          errorcode = ERR50;
4557
3
          goto FAILED;
4558
3
          }
4559
4560
        /* Disallow implicit union in Perl extended classes. */
4561
4562
181k
        if (class_op_state == CLASS_OP_OPERAND &&
4563
181k
            class_mode_state == CLASS_MODE_PERL_EXT)
4564
5
          {
4565
5
          errorcode = ERR113;
4566
5
          goto FAILED;
4567
5
          }
4568
4569
181k
        class_range_state = RANGE_FORBID_NO;
4570
181k
        class_op_state = CLASS_OP_OPERAND;
4571
181k
        }
4572
4573
      /* Forbid unescaped literals, and the special meaning of '-', inside a
4574
      Perl extended class. */
4575
4576
6.43M
      else if (class_mode_state == CLASS_MODE_PERL_EXT)
4577
44
        {
4578
44
        errorcode = ERR116;
4579
44
        goto FAILED;
4580
44
        }
4581
4582
      /* Handle potential start of range */
4583
4584
6.43M
      else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
4585
50.5k
        {
4586
50.5k
        *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
4587
44.9k
          META_RANGE_LITERAL : META_RANGE_ESCAPED;
4588
50.5k
        class_range_state = RANGE_STARTED;
4589
50.5k
        }
4590
4591
      /* Handle forbidden start of range */
4592
4593
6.38M
      else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
4594
152
        {
4595
152
        *parsed_pattern++ = CHAR_MINUS;
4596
152
        class_range_state = RANGE_FORBID_STARTED;
4597
152
        class_range_forbid_ptr = ptr;
4598
152
        }
4599
4600
      /* Handle a literal character */
4601
4602
6.38M
      else
4603
6.38M
        {
4604
6.49M
        CLASS_LITERAL:
4605
4606
        /* Disallow implicit union in Perl extended classes. */
4607
4608
6.49M
        if (class_op_state == CLASS_OP_OPERAND &&
4609
6.49M
            class_mode_state == CLASS_MODE_PERL_EXT)
4610
3
          {
4611
3
          errorcode = ERR113;
4612
3
          goto FAILED;
4613
3
          }
4614
4615
6.49M
        if (class_range_state == RANGE_STARTED)
4616
25.9k
          {
4617
25.9k
          if (c == parsed_pattern[-2])       /* Optimize one-char range */
4618
1.68k
            parsed_pattern--;
4619
24.2k
          else if (parsed_pattern[-2] > c)   /* Check range is in order */
4620
48
            {
4621
48
            errorcode = ERR8;
4622
48
            goto FAILED_BACK;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
4623
48
            }
4624
24.1k
          else
4625
24.1k
            {
4626
24.1k
            if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
4627
473
              parsed_pattern[-1] = META_RANGE_ESCAPED;
4628
24.1k
            PARSED_LITERAL(c, parsed_pattern);
4629
24.1k
            }
4630
25.8k
          class_range_state = RANGE_NO;
4631
25.8k
          class_op_state = CLASS_OP_OPERAND;
4632
25.8k
          }
4633
6.46M
        else if (class_range_state == RANGE_FORBID_STARTED)
4634
5
          {
4635
5
          ptr = class_range_forbid_ptr;
4636
5
          errorcode = ERR50;
4637
5
          goto FAILED;
4638
5
          }
4639
6.46M
        else  /* Potential start of range */
4640
6.46M
          {
4641
6.46M
          class_range_state = char_is_literal?
4642
6.35M
            RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
4643
6.46M
          class_op_state = CLASS_OP_OPERAND;
4644
6.46M
          PARSED_LITERAL(c, parsed_pattern);
4645
6.46M
          }
4646
6.49M
        }
4647
4648
      /* Proceed to next thing in the class. */
4649
4650
6.88M
      CLASS_CONTINUE:
4651
6.88M
      if (ptr >= ptrend)
4652
793
        {
4653
793
        /* Pick the error for a class that runs off the end of the pattern. The
        cases must be chained with "else": the first two tests compare
        class_mode_state against different modes, so without the chaining the
        final "else" would overwrite the ERR14 setting with ERR6. */
        if (class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 > 0)
4654
7
          errorcode = ERR14;   /* Missing terminating ')' */
4655
793
        else if (class_mode_state == CLASS_MODE_ALT_EXT &&
4656
793
            class_depth_m1 == 0 && class_maxdepth_m1 == 1)
4657
40
          errorcode = ERR112;  /* Missing terminating ']', but we saw '[ [ ]...' */
4658
753
        else
4659
753
          errorcode = ERR6;    /* Missing terminating ']' */
4660
793
        goto FAILED;
4661
793
        }
4662
6.88M
      GETCHARINCTEST(c, ptr);
4663
6.88M
      }     /* End of class-processing loop */
4664
4665
489k
    break;  /* End of character class */
4666
4667
4668
    /* ---- Opening parenthesis ---- */
4669
4670
489k
    case CHAR_LEFT_PARENTHESIS:
4671
456k
    if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4672
4673
    /* If ( is not followed by ? it is either a capture or a special verb or an
4674
    alpha assertion or a positive non-atomic lookahead. */
4675
4676
456k
    if (*ptr != CHAR_QUESTION_MARK)
4677
290k
      {
4678
290k
      const char *vn;
4679
4680
      /* Handle capturing brackets (or non-capturing if auto-capture is turned
4681
      off). */
4682
4683
290k
      if (*ptr != CHAR_ASTERISK)
4684
253k
        {
4685
253k
        nest_depth++;
4686
253k
        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
4687
205k
          {
4688
205k
          if (cb->bracount >= MAX_GROUP_NUMBER)
4689
0
            {
4690
0
            errorcode = ERR97;
4691
0
            goto FAILED;
4692
0
            }
4693
205k
          cb->bracount++;
4694
205k
          *parsed_pattern++ = META_CAPTURE | cb->bracount;
4695
205k
          }
4696
48.1k
        else *parsed_pattern++ = META_NOCAPTURE;
4697
253k
        }
4698
4699
      /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
4700
      quantifier" error rather than "(*MARK) must have an argument". */
4701
4702
36.2k
      else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
4703
10
        break;
4704
4705
      /* Handle "alpha assertions" such as (*pla:...). Most of these are
4706
      synonyms for the historical symbolic assertions, but the script run and
4707
      non-atomic lookaround ones are new. They are distinguished by starting
4708
      with a lower case letter. Checking both ends of the alphabet makes this
4709
      work in all character codes. */
4710
4711
36.2k
      else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
4712
2.34k
        {
4713
2.34k
        uint32_t meta;
4714
4715
2.34k
        vn = alasnames;
4716
2.34k
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4717
2.34k
          &errorcode, cb)) goto FAILED;
4718
2.34k
        if (ptr >= ptrend || *ptr != CHAR_COLON)
4719
20
          {
4720
20
          errorcode = ERR95;  /* Malformed */
4721
20
          goto FAILED;
4722
20
          }
4723
4724
        /* Scan the table of alpha assertion names */
4725
4726
32.1k
        for (i = 0; i < alascount; i++)
4727
32.1k
          {
4728
32.1k
          if (namelen == alasmeta[i].len &&
4729
32.1k
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4730
2.31k
            break;
4731
29.8k
          vn += alasmeta[i].len + 1;
4732
29.8k
          }
4733
4734
2.32k
        if (i >= alascount)
4735
10
          {
4736
10
          errorcode = ERR95;  /* Alpha assertion not recognized */
4737
10
          goto FAILED;
4738
10
          }
4739
4740
        /* Check for expecting an assertion condition. If so, only atomic
4741
        lookaround assertions are valid. */
4742
4743
2.31k
        meta = alasmeta[i].meta;
4744
2.31k
        if (prev_expect_cond_assert > 0 &&
4745
2.31k
            (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
4746
6
          {
4747
6
          errorcode = ERR28;  /* Atomic assertion expected */
4748
6
          goto FAILED;
4749
6
          }
4750
4751
        /* The lookaround alphabetic synonyms can mostly be handled by jumping
4752
        to the code that handles the traditional symbolic forms. */
4753
4754
2.30k
        switch(meta)
4755
2.30k
          {
4756
0
          default:
4757
0
          PCRE2_DEBUG_UNREACHABLE();
4758
0
          errorcode = ERR89;  /* Unknown code; should never occur because */
4759
0
          goto FAILED;        /* the meta values come from a table above. */
4760
4761
0
          case META_ATOMIC:
4762
0
          goto ATOMIC_GROUP;
4763
4764
83
          case META_LOOKAHEAD:
4765
83
          goto POSITIVE_LOOK_AHEAD;
4766
4767
66
          case META_LOOKAHEAD_NA:
4768
66
          goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4769
4770
66
          case META_LOOKAHEADNOT:
4771
66
          goto NEGATIVE_LOOK_AHEAD;
4772
4773
0
          case META_SCS:
4774
0
          ptr++;
4775
0
          *parsed_pattern++ = META_SCS;
4776
4777
0
          parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
4778
0
                                              0, &errorcode, cb);
4779
0
          if (parsed_pattern == NULL) goto FAILED;
4780
0
          goto POST_ASSERTION;
4781
4782
70
          case META_LOOKBEHIND:
4783
156
          case META_LOOKBEHINDNOT:
4784
222
          case META_LOOKBEHIND_NA:
4785
222
          *parsed_pattern++ = meta;
4786
222
          ptr--;
4787
222
          goto POST_LOOKBEHIND;
4788
4789
          /* The script run facilities are handled here. Unicode support is
4790
          required (give an error if not, as this is a security issue). Always
4791
          record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4792
          META_ATOMIC and remember that we need two META_KETs at the end. */
4793
4794
1.47k
          case META_SCRIPT_RUN:
4795
1.87k
          case META_ATOMIC_SCRIPT_RUN:
4796
1.87k
#ifdef SUPPORT_UNICODE
4797
1.87k
          *parsed_pattern++ = META_SCRIPT_RUN;
4798
1.87k
          nest_depth++;
4799
1.87k
          ptr++;
4800
1.87k
          if (meta == META_ATOMIC_SCRIPT_RUN)
4801
393
            {
4802
393
            *parsed_pattern++ = META_ATOMIC;
4803
393
            if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4804
90
            else if (++top_nest >= end_nests)
4805
0
              {
4806
0
              errorcode = ERR84;
4807
0
              goto FAILED;
4808
0
              }
4809
393
            top_nest->nest_depth = nest_depth;
4810
393
            top_nest->flags = NSF_ATOMICSR;
4811
393
            top_nest->options = options & PARSE_TRACKED_OPTIONS;
4812
393
            top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4813
4814
#ifdef PCRE2_DEBUG
4815
            /* We'll write out two META_KETs for a single ")" in the input
4816
            pattern, so we reserve space for that in our bounds check. */
4817
            parsed_pattern_extra++;
4818
#endif
4819
393
            }
4820
1.87k
          break;
4821
#else  /* SUPPORT_UNICODE */
4822
          errorcode = ERR96;
4823
          goto FAILED;
4824
#endif
4825
2.30k
          }
4826
2.30k
        }
4827
4828
4829
      /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4830
4831
33.8k
      else
4832
33.8k
        {
4833
33.8k
        vn = verbnames;
4834
33.8k
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4835
33.8k
          &errorcode, cb)) goto FAILED;
4836
33.8k
        if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4837
33.8k
                              *ptr != CHAR_RIGHT_PARENTHESIS))
4838
102
          {
4839
102
          errorcode = ERR60;  /* Malformed */
4840
102
          goto FAILED;
4841
102
          }
4842
4843
        /* Scan the table of verb names */
4844
4845
215k
        for (i = 0; i < verbcount; i++)
4846
215k
          {
4847
215k
          if (namelen == verbs[i].len &&
4848
215k
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4849
33.7k
            break;
4850
182k
          vn += verbs[i].len + 1;
4851
182k
          }
4852
4853
33.7k
        if (i >= verbcount)
4854
29
          {
4855
29
          errorcode = ERR60;  /* Verb not recognized */
4856
29
          goto FAILED;
4857
29
          }
4858
4859
        /* An empty argument is treated as no argument. */
4860
4861
33.7k
        if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4862
33.7k
             ptr[1] == CHAR_RIGHT_PARENTHESIS)
4863
92
          ptr++;    /* Advance to the closing parens */
4864
4865
        /* Check for mandatory non-empty argument; this is (*MARK) */
4866
4867
33.7k
        if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4868
3
          {
4869
3
          errorcode = ERR66;
4870
3
          goto FAILED;
4871
3
          }
4872
4873
        /* Remember where this verb, possibly with a preceding (*MARK), starts,
4874
        for handling quantified (*ACCEPT). */
4875
4876
33.7k
        verbstartptr = parsed_pattern;
4877
33.7k
        okquantifier = (verbs[i].meta == META_ACCEPT);
4878
#ifdef PCRE2_DEBUG
4879
        /* Reserve space in our bounds check for optionally wrapping the (*ACCEPT)
4880
        with a non-capturing bracket, if there is a following quantifier. */
4881
        if (okquantifier) parsed_pattern_extra += 2;
4882
#endif
4883
4884
        /* It appears that Perl allows any characters whatsoever, other than a
4885
        closing parenthesis, to appear in arguments ("names"), so we no longer
4886
        insist on letters, digits, and underscores. Perl does not, however, do
4887
        any interpretation within arguments, and has no means of including a
4888
        closing parenthesis. PCRE supports escape processing but only when it
4889
        is requested by an option. We set inverbname TRUE here, and let the
4890
        main loop take care of this so that escape and \x processing is done by
4891
        the main code above. */
4892
4893
33.7k
        if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4894
8.92k
          {
4895
          /* Some optional arguments can be treated as a preceding (*MARK) */
4896
4897
8.92k
          if (verbs[i].has_arg < 0)
4898
1.11k
            {
4899
1.11k
            add_after_mark = verbs[i].meta;
4900
1.11k
            *parsed_pattern++ = META_MARK;
4901
1.11k
            }
4902
4903
          /* The remaining verbs with arguments (except *MARK) need a different
4904
          opcode. */
4905
4906
7.80k
          else
4907
7.80k
            {
4908
7.80k
            *parsed_pattern++ = verbs[i].meta +
4909
7.80k
              ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4910
7.80k
            }
4911
4912
          /* Set up for reading the name in the main loop. */
4913
4914
8.92k
          verblengthptr = parsed_pattern++;
4915
8.92k
          verbnamestart = ptr;
4916
8.92k
          inverbname = TRUE;
4917
8.92k
          }
4918
24.8k
        else  /* No verb "name" argument */
4919
24.8k
          {
4920
24.8k
          *parsed_pattern++ = verbs[i].meta;
4921
24.8k
          }
4922
33.7k
        }     /* End of (*VERB) handling */
4923
289k
      break;  /* Done with this parenthesis */
4924
290k
      }       /* End of groups that don't start with (? */
4925
4926
4927
    /* ---- Items starting (? ---- */
4928
4929
    /* The type of item is determined by what follows (?. Handle (?| and option
4930
    changes under "default" because both need a new block on the nest stack.
4931
    Comments starting with (?# are handled above. Note that there is some
4932
    ambiguity about the sequence (?- because if a digit follows it's a relative
4933
    recursion or subroutine call whereas otherwise it's an option unsetting. */
4934
4935
166k
    if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4936
4937
166k
    switch(*ptr)
4938
166k
      {
4939
17.6k
      default:
4940
17.6k
      if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4941
82
        goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4942
4943
      /* We now have either (?| or a (possibly empty) option setting,
4944
      optionally followed by a non-capturing group. */
4945
4946
17.5k
      nest_depth++;
4947
17.5k
      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4948
10.5k
      else if (++top_nest >= end_nests)
4949
0
        {
4950
0
        errorcode = ERR84;
4951
0
        goto FAILED;
4952
0
        }
4953
17.5k
      top_nest->nest_depth = nest_depth;
4954
17.5k
      top_nest->flags = 0;
4955
17.5k
      top_nest->options = options & PARSE_TRACKED_OPTIONS;
4956
17.5k
      top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4957
4958
      /* Start of non-capturing group that resets the capture count for each
4959
      branch. */
4960
4961
17.5k
      if (*ptr == CHAR_VERTICAL_LINE)
4962
4.78k
        {
4963
4.78k
        top_nest->reset_group = (uint16_t)cb->bracount;
4964
4.78k
        top_nest->max_group = (uint16_t)cb->bracount;
4965
4.78k
        top_nest->flags |= NSF_RESET;
4966
4.78k
        cb->external_flags |= PCRE2_DUPCAPUSED;
4967
4.78k
        *parsed_pattern++ = META_NOCAPTURE;
4968
4.78k
        ptr++;
4969
4.78k
        }
4970
4971
      /* Scan for options imnrsxJU, and a (alone or as aD, aP, aS, aT, aW), to be set or unset. */
4972
4973
12.7k
      else
4974
12.7k
        {
4975
12.7k
        BOOL hyphenok = TRUE;
4976
12.7k
        uint32_t oldoptions = options;
4977
12.7k
        uint32_t oldxoptions = xoptions;
4978
4979
12.7k
        top_nest->reset_group = 0;
4980
12.7k
        top_nest->max_group = 0;
4981
12.7k
        set = unset = 0;
4982
12.7k
        optset = &set;
4983
12.7k
        xset = xunset = 0;
4984
12.7k
        xoptset = &xset;
4985
4986
        /* ^ at the start unsets irmnsx and disables the subsequent use of - */
4987
4988
12.7k
        if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
4989
197
          {
4990
197
          options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
4991
197
                       PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
4992
197
          xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
4993
197
          hyphenok = FALSE;
4994
197
          ptr++;
4995
197
          }
4996
4997
25.0k
        while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
4998
25.0k
                               *ptr != CHAR_COLON)
4999
12.3k
          {
5000
12.3k
          switch (*ptr++)
5001
12.3k
            {
5002
410
            case CHAR_MINUS:
5003
410
            if (!hyphenok)
5004
3
              {
5005
3
              errorcode = ERR94;
5006
3
              ptr--;  /* Correct the offset */
5007
3
              goto FAILED;
5008
3
              }
5009
407
            optset = &unset;
5010
407
            xoptset = &xunset;
5011
407
            hyphenok = FALSE;
5012
407
            break;
5013
5014
            /* There are some two-character sequences that start with 'a'. */
5015
5016
1.36k
            case CHAR_a:
5017
1.36k
            if (ptr < ptrend)
5018
1.35k
              {
5019
1.35k
              if (*ptr == CHAR_D)
5020
40
                {
5021
40
                *xoptset |= PCRE2_EXTRA_ASCII_BSD;
5022
40
                ptr++;
5023
40
                break;
5024
40
                }
5025
1.31k
              if (*ptr == CHAR_P)
5026
68
                {
5027
68
                *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
5028
68
                ptr++;
5029
68
                break;
5030
68
                }
5031
1.25k
              if (*ptr == CHAR_S)
5032
38
                {
5033
38
                *xoptset |= PCRE2_EXTRA_ASCII_BSS;
5034
38
                ptr++;
5035
38
                break;
5036
38
                }
5037
1.21k
              if (*ptr == CHAR_T)
5038
594
                {
5039
594
                *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
5040
594
                ptr++;
5041
594
                break;
5042
594
                }
5043
619
              if (*ptr == CHAR_W)
5044
72
                {
5045
72
                *xoptset |= PCRE2_EXTRA_ASCII_BSW;
5046
72
                ptr++;
5047
72
                break;
5048
72
                }
5049
619
              }
5050
551
            *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
5051
551
                        PCRE2_EXTRA_ASCII_BSW|
5052
551
                        PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
5053
551
            break;
5054
5055
3.99k
            case CHAR_J:  /* Record that it changed in the external options */
5056
3.99k
            *optset |= PCRE2_DUPNAMES;
5057
3.99k
            cb->external_flags |= PCRE2_JCHANGED;
5058
3.99k
            break;
5059
5060
2.10k
            case CHAR_i: *optset |= PCRE2_CASELESS; break;
5061
251
            case CHAR_m: *optset |= PCRE2_MULTILINE; break;
5062
345
            case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
5063
1.38k
            case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
5064
256
            case CHAR_s: *optset |= PCRE2_DOTALL; break;
5065
1.28k
            case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
5066
5067
            /* If x appears twice it sets the extended extended option. */
5068
5069
817
            case CHAR_x:
5070
817
            *optset |= PCRE2_EXTENDED;
5071
817
            if (ptr < ptrend && *ptr == CHAR_x)
5072
204
              {
5073
204
              *optset |= PCRE2_EXTENDED_MORE;
5074
204
              ptr++;
5075
204
              }
5076
817
            break;
5077
5078
165
            default:
5079
165
            errorcode = ERR11;
5080
165
            ptr--;    /* Correct the offset */
5081
165
            goto FAILED;
5082
12.3k
            }
5083
12.3k
          }
5084
5085
        /* If we are setting extended without extended-more, ensure that any
5086
        existing extended-more gets unset. Also, unsetting extended must also
5087
        unset extended-more. */
5088
5089
12.6k
        if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
5090
12.6k
            (unset & PCRE2_EXTENDED) != 0)
5091
585
          unset |= PCRE2_EXTENDED_MORE;
5092
5093
12.6k
        options = (options | set) & (~unset);
5094
12.6k
        xoptions = (xoptions | xset) & (~xunset);
5095
5096
        /* If the options ended with ')' this is not the start of a nested
5097
        group with option changes, so the options change at this level.
5098
        In this case, if the previous level set up a nest block, discard the
5099
        one we have just created. Otherwise adjust it for the previous level.
5100
        If the options ended with ':' we are starting a non-capturing group,
5101
        possibly with an options setting. */
5102
5103
12.6k
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5104
12.6k
        if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
5105
10.3k
          {
5106
10.3k
          nest_depth--;  /* This is not a nested group after all. */
5107
10.3k
          if (top_nest > (nest_save *)(cb->start_workspace) &&
5108
10.3k
              (top_nest-1)->nest_depth == nest_depth) top_nest--;
5109
4.57k
          else top_nest->nest_depth = nest_depth;
5110
10.3k
          }
5111
2.24k
        else *parsed_pattern++ = META_NOCAPTURE;
5112
5113
        /* If nothing changed, no need to record. */
5114
5115
12.6k
        if (options != oldoptions || xoptions != oldxoptions)
5116
3.80k
          {
5117
3.80k
          *parsed_pattern++ = META_OPTIONS;
5118
3.80k
          *parsed_pattern++ = options;
5119
3.80k
          *parsed_pattern++ = xoptions;
5120
3.80k
          }
5121
12.6k
        }     /* End options processing */
5122
17.3k
      break;  /* End default case after (? */
5123
5124
5125
      /* ---- Python syntax support ---- */
5126
5127
17.3k
      case CHAR_P:
5128
559
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5129
5130
      /* (?P<name> is the same as (?<name>, which defines a named group. */
5131
5132
552
      if (*ptr == CHAR_LESS_THAN_SIGN)
5133
10
        {
5134
10
        terminator = CHAR_GREATER_THAN_SIGN;
5135
10
        goto DEFINE_NAME;
5136
10
        }
5137
5138
      /* (?P>name) is the same as (?&name), which is a recursion or subroutine
5139
      call. */
5140
5141
542
      if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
5142
5143
      /* (?P=name) is the same as \k<name>, a back reference by name. Anything
5144
      else after (?P is an error. */
5145
5146
146
      if (*ptr != CHAR_EQUALS_SIGN)
5147
7
        {
5148
7
        errorcode = ERR41;
5149
7
        goto FAILED;
5150
7
        }
5151
139
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5152
139
          &namelen, &errorcode, cb)) goto FAILED;
5153
136
      *parsed_pattern++ = META_BACKREF_BYNAME;
5154
136
      *parsed_pattern++ = namelen;
5155
136
      PUTOFFSET(offset, parsed_pattern);
5156
136
      okquantifier = TRUE;
5157
136
      break;   /* End of (?P processing */
5158
5159
5160
      /* ---- Recursion/subroutine calls by number ---- */
5161
5162
10.7k
      case CHAR_R:
5163
10.7k
      i = 0;         /* (?R) == (?R0) */
5164
10.7k
      ptr++;
5165
10.7k
      if (ptr >= ptrend || (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_LEFT_PARENTHESIS))
5166
8
        {
5167
8
        errorcode = ERR58;
5168
8
        goto FAILED;
5169
8
        }
5170
10.7k
      terminator = CHAR_NUL;
5171
10.7k
      goto SET_RECURSION;
5172
5173
      /* An item starting (?- followed by a digit comes here via the "default"
5174
      case because (?- followed by a non-digit is an options setting. */
5175
5176
19.0k
      case CHAR_PLUS:
5177
19.0k
      if (ptrend - ptr < 2 || !IS_DIGIT(ptr[1]))
5178
10
        {
5179
10
        errorcode = ERR29;   /* Missing number */
5180
10
        goto FAILED;
5181
10
        }
5182
      /* Fall through */
5183
5184
34.8k
      case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5185
35.9k
      case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5186
36.0k
      RECURSION_BYNUMBER:
5187
36.0k
      if (!read_number(&ptr, ptrend,
5188
36.0k
          (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
5189
36.0k
          MAX_GROUP_NUMBER, ERR61,
5190
36.0k
          &i, &errorcode)) goto FAILED;
5191
35.9k
      PCRE2_ASSERT(i >= 0);  /* NB (?0) is permitted, represented by i=0 */
5192
35.9k
      terminator = CHAR_NUL;
5193
5194
46.8k
      SET_RECURSION:
5195
46.8k
      *parsed_pattern++ = META_RECURSE | (uint32_t)i;
5196
46.8k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5197
      /* End of recursive call by number handling */
5198
46.8k
      goto READ_RECURSION_ARGUMENTS;
5199
5200
5201
      /* ---- Recursion/subroutine calls by name ---- */
5202
5203
77
      case CHAR_AMPERSAND:
5204
473
      RECURSE_BY_NAME:
5205
473
      if (!read_name(&ptr, ptrend, utf, 0, &offset, &name,
5206
473
          &namelen, &errorcode, cb)) goto FAILED;
5207
469
      *parsed_pattern++ = META_RECURSE_BYNAME;
5208
469
      *parsed_pattern++ = namelen;
5209
469
      terminator = CHAR_NUL;
5210
5211
47.3k
      READ_RECURSION_ARGUMENTS:
5212
47.3k
      PUTOFFSET(offset, parsed_pattern);
5213
47.3k
      okquantifier = TRUE;
5214
5215
      /* Arguments are not supported for \g construct. */
5216
47.3k
      if (terminator != CHAR_NUL) break;
5217
5218
47.1k
      if (ptr < ptrend && *ptr == CHAR_LEFT_PARENTHESIS)
5219
517
        {
5220
517
        parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
5221
517
                                            offset, &errorcode, cb);
5222
517
        if (parsed_pattern == NULL) goto FAILED;
5223
517
        }
5224
5225
47.1k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5226
36
        goto UNCLOSED_PARENTHESIS;
5227
5228
47.0k
      ptr++;
5229
47.0k
      break;
5230
5231
      /* ---- Callout with numerical or string argument ---- */
5232
5233
8.19k
      case CHAR_C:
5234
8.19k
      if ((xoptions & PCRE2_EXTRA_NEVER_CALLOUT) != 0)
5235
0
        {
5236
0
        errorcode = ERR103;
5237
0
        goto FAILED;
5238
0
        }
5239
5240
8.19k
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5241
5242
      /* If the previous item was a condition starting (?(? an assertion,
5243
      optionally preceded by a callout, is expected. This is checked later on,
5244
      during actual compilation. However we need to identify this kind of
5245
      assertion in this pass because it must not be qualified. The value of
5246
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5247
      for a callout - still leaving a positive value that identifies the
5248
      assertion. Multiple callouts or any other items will make it zero or
5249
      less, which doesn't matter because they will cause an error later. */
5250
5251
8.18k
      expect_cond_assert = prev_expect_cond_assert - 1;
5252
5253
      /* If previous_callout is not NULL, it means this follows a previous
5254
      callout. If it was a manual callout, do nothing; this means its "length
5255
      of next pattern item" field will remain zero. If it was an automatic
5256
      callout, abolish it. */
5257
5258
8.18k
      if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
5259
8.18k
          previous_callout == parsed_pattern - 4 &&
5260
8.18k
          parsed_pattern[-1] == 255)
5261
910
        parsed_pattern = previous_callout;
5262
5263
      /* Save for updating next pattern item length, and skip one item before
5264
      completing. */
5265
5266
8.18k
      previous_callout = parsed_pattern;
5267
8.18k
      after_manual_callout = 1;
5268
5269
      /* Handle a string argument; specific delimiter is required. */
5270
5271
8.18k
      if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
5272
7.07k
        {
5273
7.07k
        PCRE2_SIZE calloutlength;
5274
7.07k
        PCRE2_SPTR startptr = ptr;
5275
5276
7.07k
        delimiter = 0;
5277
39.1k
        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
5278
39.1k
          {
5279
39.1k
          if (*ptr == PRIV(callout_start_delims)[i])
5280
7.07k
            {
5281
7.07k
            delimiter = PRIV(callout_end_delims)[i];
5282
7.07k
            break;
5283
7.07k
            }
5284
39.1k
          }
5285
7.07k
        if (delimiter == 0)
5286
6
          {
5287
6
          errorcode = ERR82;
5288
6
          goto FAILED;
5289
6
          }
5290
5291
7.07k
        *parsed_pattern = META_CALLOUT_STRING;
5292
7.07k
        parsed_pattern += 3;   /* Skip pattern info */
5293
5294
7.07k
        for (;;)
5295
61.1k
          {
5296
61.1k
          if (++ptr >= ptrend)
5297
14
            {
5298
14
            errorcode = ERR81;
5299
14
            ptr = startptr;   /* To give a more useful message */
5300
14
            goto FAILED;
5301
14
            }
5302
61.1k
          if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
5303
7.05k
            break;
5304
61.1k
          }
5305
5306
7.05k
        calloutlength = (PCRE2_SIZE)(ptr - startptr);
5307
7.05k
        if (calloutlength > UINT32_MAX)
5308
0
          {
5309
0
          errorcode = ERR72;
5310
0
          goto FAILED;
5311
0
          }
5312
7.05k
        *parsed_pattern++ = (uint32_t)calloutlength;
5313
7.05k
        offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
5314
7.05k
        PUTOFFSET(offset, parsed_pattern);
5315
7.05k
        }
5316
5317
      /* Handle a callout with an optional numerical argument, which must be
5318
      less than or equal to 255. A missing argument gives 0. */
5319
5320
1.10k
      else
5321
1.10k
        {
5322
1.10k
        int n = 0;
5323
1.10k
        *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
5324
1.10k
        parsed_pattern += 3;                       /* Skip pattern info */
5325
1.48k
        while (ptr < ptrend && IS_DIGIT(*ptr))
5326
385
          {
5327
385
          n = n * 10 + (*ptr++ - CHAR_0);
5328
385
          if (n > 255)
5329
5
            {
5330
5
            errorcode = ERR38;
5331
5
            goto FAILED;
5332
5
            }
5333
385
          }
5334
1.10k
        *parsed_pattern++ = n;
5335
1.10k
        }
5336
5337
      /* Both formats must have a closing parenthesis */
5338
5339
8.15k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5340
42
        {
5341
42
        errorcode = ERR39;
5342
42
        goto FAILED;
5343
42
        }
5344
8.11k
      ptr++;
5345
5346
      /* Remember the offset to the next item in the pattern, and set a default
5347
      length. This should get updated after the next item is read. */
5348
5349
8.11k
      previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
5350
8.11k
      previous_callout[2] = 0;
5351
8.11k
      break;                  /* End callout */
5352
5353
5354
      /* ---- Conditional group ---- */
5355
5356
      /* A condition can be an assertion, a number (referring to a numbered
5357
      group's having been set), a name (referring to a named group), or 'R',
5358
      referring to overall recursion. R<digits> and R&name are also permitted
5359
      for recursion state tests. Numbers may be preceded by + or - to specify a
5360
      relative group number.
5361
5362
      There are several syntaxes for testing a named group: (?(name)) is used
5363
      by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5364
5365
      There are two unfortunate ambiguities. 'R' can be the recursive thing or
5366
      the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
5367
      the Perl DEFINE feature or the Python named test. We look for a name
5368
      first; if not found, we try the other case.
5369
5370
      For compatibility with auto-callouts, we allow a callout to be specified
5371
      before a condition that is an assertion. */
5372
5373
19.0k
      case CHAR_LEFT_PARENTHESIS:
5374
19.0k
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5375
19.0k
      nest_depth++;
5376
5377
      /* If the next character is ? or * there must be an assertion next
5378
      (optionally preceded by a callout). We do not check this here, but
5379
      instead we set expect_cond_assert to 2. If this is still greater than
5380
      zero (callouts decrement it) when the next assertion is read, it will be
5381
      marked as a condition that must not be repeated. A value greater than
5382
      zero also causes checking that an assertion (possibly with callout)
5383
      follows. */
5384
5385
19.0k
      if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
5386
12.1k
        {
5387
12.1k
        *parsed_pattern++ = META_COND_ASSERT;
5388
12.1k
        ptr--;   /* Pull pointer back to the opening parenthesis. */
5389
12.1k
        expect_cond_assert = 2;
5390
12.1k
        break;  /* End of conditional */
5391
12.1k
        }
5392
5393
      /* Handle (?([+-]number)... */
5394
5395
6.83k
      if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
5396
6.83k
          &errorcode))
5397
419
        {
5398
419
        PCRE2_ASSERT(i >= 0);
5399
419
        if (i <= 0)
5400
3
          {
5401
3
          errorcode = ERR15;
5402
3
          goto FAILED;
5403
3
          }
5404
416
        *parsed_pattern++ = META_COND_NUMBER;
5405
416
        offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5406
416
        PUTOFFSET(offset, parsed_pattern);
5407
416
        *parsed_pattern++ = i;
5408
416
        }
5409
6.41k
      else if (errorcode != 0) goto FAILED;   /* Number too big */
5410
5411
      /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
5412
5413
6.41k
      else if (ptrend - ptr >= 10 &&
5414
6.41k
               PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
5415
6.41k
               ptr[7] != CHAR_RIGHT_PARENTHESIS)
5416
750
        {
5417
750
        uint32_t ge = 0;
5418
750
        int major = 0;
5419
750
        int minor = 0;
5420
5421
750
        ptr += 7;
5422
750
        if (*ptr == CHAR_GREATER_THAN_SIGN)
5423
278
          {
5424
278
          ge = 1;
5425
278
          ptr++;
5426
278
          }
5427
5428
        /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
5429
        references its argument twice. */
5430
5431
750
        if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
5432
7
          goto BAD_VERSION_CONDITION;
5433
5434
743
        if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
5435
3
          goto FAILED;
5436
5437
740
        if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
5438
737
        if (*ptr == CHAR_DOT)
5439
84
          {
5440
84
          if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;
5441
75
          minor = (*ptr++ - CHAR_0) * 10;
5442
75
          if (ptr >= ptrend) goto BAD_VERSION_CONDITION;
5443
72
          if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;
5444
72
          if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5445
6
            goto BAD_VERSION_CONDITION;
5446
72
          }
5447
5448
719
        *parsed_pattern++ = META_COND_VERSION;
5449
719
        *parsed_pattern++ = ge;
5450
719
        *parsed_pattern++ = major;
5451
719
        *parsed_pattern++ = minor;
5452
719
        }
5453
5454
      /* All the remaining cases now require us to read a name. We cannot at
5455
      this stage distinguish ambiguous cases such as (?(R12) which might be a
5456
      recursion test by number or a name, because the named groups have not yet
5457
      all been identified. Those cases are treated as names, but given a
5458
      different META code. */
5459
5460
5.66k
      else
5461
5.66k
        {
5462
5.66k
        BOOL was_r_ampersand = FALSE;
5463
5464
5.66k
        if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
5465
1.41k
          {
5466
1.41k
          terminator = CHAR_RIGHT_PARENTHESIS;
5467
1.41k
          was_r_ampersand = TRUE;
5468
1.41k
          ptr++;
5469
1.41k
          }
5470
4.24k
        else if (*ptr == CHAR_LESS_THAN_SIGN)
5471
137
          terminator = CHAR_GREATER_THAN_SIGN;
5472
4.11k
        else if (*ptr == CHAR_APOSTROPHE)
5473
13
          terminator = CHAR_APOSTROPHE;
5474
4.09k
        else
5475
4.09k
          {
5476
4.09k
          terminator = CHAR_RIGHT_PARENTHESIS;
5477
4.09k
          ptr--;   /* Point to char before name */
5478
4.09k
          }
5479
5.66k
        if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5480
5.66k
            &errorcode, cb)) goto FAILED;
5481
5482
        /* Handle (?(R&name) */
5483
5484
5.58k
        if (was_r_ampersand)
5485
1.41k
          {
5486
1.41k
          *parsed_pattern = META_COND_RNAME;
5487
1.41k
          ptr--;   /* Back to closing parens */
5488
1.41k
          }
5489
5490
        /* Handle (?(name). If the name is "DEFINE" we identify it with a
5491
        special code. Likewise if the name consists of R followed only by
5492
        digits. Otherwise, handle it like a quoted name. */
5493
5494
4.17k
        else if (terminator == CHAR_RIGHT_PARENTHESIS)
5495
4.03k
          {
5496
4.03k
          if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
5497
0
            *parsed_pattern = META_COND_DEFINE;
5498
4.03k
          else
5499
4.03k
            {
5500
5.12k
            for (i = 1; i < (int)namelen; i++)
5501
1.19k
              if (!IS_DIGIT(name[i])) break;
5502
4.03k
            *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
5503
3.02k
              META_COND_RNUMBER : META_COND_NAME;
5504
4.03k
            }
5505
4.03k
          ptr--;   /* Back to closing parens */
5506
4.03k
          }
5507
5508
        /* Handle (?('name') or (?(<name>) */
5509
5510
140
        else *parsed_pattern = META_COND_NAME;
5511
5512
        /* All these cases except DEFINE end with the name length and offset;
5513
        DEFINE just has an offset (for the "too many branches" error). */
5514
5515
5.58k
        if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
5516
5.58k
        PUTOFFSET(offset, parsed_pattern);
5517
5.58k
        }  /* End cases that read a name */
5518
5519
      /* Check the closing parenthesis of the condition */
5520
5521
6.72k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5522
12
        {
5523
12
        errorcode = ERR24;
5524
12
        goto FAILED;
5525
12
        }
5526
6.71k
      ptr++;
5527
6.71k
      break;  /* End of condition processing */
5528
5529
5530
      /* ---- Atomic group ---- */
5531
5532
3.89k
      case CHAR_GREATER_THAN_SIGN:
5533
3.89k
      ATOMIC_GROUP:                          /* Come from (*atomic: */
5534
3.89k
      *parsed_pattern++ = META_ATOMIC;
5535
3.89k
      nest_depth++;
5536
3.89k
      ptr++;
5537
3.89k
      break;
5538
5539
5540
      /* ---- Lookahead assertions ---- */
5541
5542
17.5k
      case CHAR_EQUALS_SIGN:
5543
17.5k
      POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
5544
17.5k
      *parsed_pattern++ = META_LOOKAHEAD;
5545
17.5k
      ptr++;
5546
17.5k
      goto POST_ASSERTION;
5547
5548
8.20k
      case CHAR_ASTERISK:
5549
8.26k
      POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (*napla: */
5550
8.26k
      *parsed_pattern++ = META_LOOKAHEAD_NA;
5551
8.26k
      ptr++;
5552
8.26k
      goto POST_ASSERTION;
5553
5554
9.82k
      case CHAR_EXCLAMATION_MARK:
5555
9.89k
      NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
5556
9.89k
      *parsed_pattern++ = META_LOOKAHEADNOT;
5557
9.89k
      ptr++;
5558
9.89k
      goto POST_ASSERTION;
5559
5560
5561
      /* ---- Lookbehind assertions ---- */
5562
5563
      /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
5564
      is the start of the name of a capturing group. */
5565
5566
22.9k
      case CHAR_LESS_THAN_SIGN:
5567
22.9k
      if (ptrend - ptr <= 1 ||
5568
22.9k
         (ptr[1] != CHAR_EQUALS_SIGN &&
5569
22.8k
          ptr[1] != CHAR_EXCLAMATION_MARK &&
5570
22.8k
          ptr[1] != CHAR_ASTERISK))
5571
2.86k
        {
5572
2.86k
        terminator = CHAR_GREATER_THAN_SIGN;
5573
2.86k
        goto DEFINE_NAME;
5574
2.86k
        }
5575
20.0k
      *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
5576
13.9k
        META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
5577
10.0k
        META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
5578
5579
20.2k
      POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
5580
20.2k
      *has_lookbehind = TRUE;
5581
20.2k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5582
20.2k
      PUTOFFSET(offset, parsed_pattern);
5583
20.2k
      ptr += 2;
5584
      /* Fall through */
5585
5586
      /* If the previous item was a condition starting (?(? an assertion,
5587
      optionally preceded by a callout, is expected. This is checked later on,
5588
      during actual compilation. However we need to identify this kind of
5589
      assertion in this pass because it must not be qualified. The value of
5590
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5591
      for a callout - still leaving a positive value that identifies the
5592
      assertion. Multiple callouts or any other items will make it zero or
5593
      less, which doesn't matter because they will cause an error later. */
5594
5595
56.0k
      POST_ASSERTION:
5596
56.0k
      nest_depth++;
5597
56.0k
      if (prev_expect_cond_assert > 0)
5598
12.1k
        {
5599
12.1k
        if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
5600
1.56k
        else if (++top_nest >= end_nests)
5601
0
          {
5602
0
          errorcode = ERR84;
5603
0
          goto FAILED;
5604
0
          }
5605
12.1k
        top_nest->nest_depth = nest_depth;
5606
12.1k
        top_nest->flags = NSF_CONDASSERT;
5607
12.1k
        top_nest->options = options & PARSE_TRACKED_OPTIONS;
5608
12.1k
        top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5609
12.1k
        }
5610
56.0k
      break;
5611
5612
5613
      /* ---- Define a named group ---- */
5614
5615
      /* A named group may be defined as (?'name') or (?<name>). In the latter
5616
      case we jump to DEFINE_NAME from the disambiguation of (?< above with the
5617
      terminator set to '>'. */
5618
5619
56.0k
      case CHAR_APOSTROPHE:
5620
11.5k
      terminator = CHAR_APOSTROPHE;    /* Terminator */
5621
5622
14.4k
      DEFINE_NAME:
5623
14.4k
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5624
14.4k
          &errorcode, cb)) goto FAILED;
5625
5626
      /* We have a name for this capturing group. It is also assigned a number,
5627
      which is its primary means of identification. */
5628
5629
14.3k
      if (cb->bracount >= MAX_GROUP_NUMBER)
5630
0
        {
5631
0
        errorcode = ERR97;
5632
0
        goto FAILED;
5633
0
        }
5634
14.3k
      cb->bracount++;
5635
14.3k
      *parsed_pattern++ = META_CAPTURE | cb->bracount;
5636
14.3k
      nest_depth++;
5637
5638
      /* Check not too many names */
5639
5640
14.3k
      if (cb->names_found >= MAX_NAME_COUNT)
5641
0
        {
5642
0
        errorcode = ERR49;
5643
0
        goto FAILED;
5644
0
        }
5645
5646
      /* Adjust the entry size to accommodate the longest name found. */
5647
5648
14.3k
      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
5649
1.66k
        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
5650
5651
      /* Scan the list to check for duplicates. For duplicate names, if the
5652
      number is the same, break the loop, which causes the name to be
5653
      discarded; otherwise, if DUPNAMES is not set, give an error.
5654
      If it is set, allow the name with a different number, but continue
5655
      scanning in case this is a duplicate with the same number. For
5656
      non-duplicate names, give an error if the number is duplicated. */
5657
5658
14.3k
      is_dupname = FALSE;
5659
14.3k
      hash = PRIV(compile_get_hash_from_name)(name, namelen);
5660
14.3k
      ng = cb->named_groups;
5661
24.9k
      for (i = 0; i < cb->names_found; i++, ng++)
5662
22.3k
        {
5663
22.3k
        if (namelen == ng->length && hash == NAMED_GROUP_GET_HASH(ng) &&
5664
22.3k
            PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
5665
11.7k
          {
5666
          /* When a bracket is referenced by the same name multiple
5667
          times, it is not considered a duplicate and is ignored. */
5668
11.7k
          if (ng->number == cb->bracount) break;
5669
11.7k
          if ((options & PCRE2_DUPNAMES) == 0)
5670
182
            {
5671
182
            errorcode = ERR43;
5672
182
            goto FAILED;
5673
182
            }
5674
5675
11.5k
          ng->hash_dup |= NAMED_GROUP_IS_DUPNAME;
5676
11.5k
          is_dupname = TRUE;                /* Mark as a duplicate */
5677
11.5k
          cb->dupnames = TRUE;              /* Duplicate names exist */
5678
5679
          /* The entry represents a duplicate. */
5680
11.5k
          name = ng->name;
5681
11.5k
          namelen = 0;
5682
5683
          /* Even duplicated names may refer to the same
5684
          capture index. These references are also ignored. */
5685
552k
          for (; i < cb->names_found; i++, ng++)
5686
541k
            if (ng->name == name && ng->number == cb->bracount)
5687
323
              break;
5688
11.5k
          break;
5689
11.7k
          }
5690
10.5k
        else if (ng->number == cb->bracount)
5691
3
          {
5692
3
          errorcode = ERR65;
5693
3
          goto FAILED;
5694
3
          }
5695
22.3k
        }
5696
5697
      /* Ignore duplicate with same number. */
5698
14.1k
      if (i < cb->names_found) break;
5699
5700
      /* Increase the list size if necessary */
5701
5702
13.7k
      if (cb->names_found >= cb->named_group_list_size)
5703
259
        {
5704
259
        uint32_t newsize = cb->named_group_list_size * 2;
5705
259
        named_group *newspace =
5706
259
          cb->cx->memctl.malloc(newsize * sizeof(named_group),
5707
259
          cb->cx->memctl.memory_data);
5708
259
        if (newspace == NULL)
5709
0
          {
5710
0
          errorcode = ERR21;
5711
0
          goto FAILED;
5712
0
          }
5713
5714
259
        memcpy(newspace, cb->named_groups,
5715
259
          cb->named_group_list_size * sizeof(named_group));
5716
259
        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5717
103
          cb->cx->memctl.free((void *)cb->named_groups,
5718
103
          cb->cx->memctl.memory_data);
5719
259
        cb->named_groups = newspace;
5720
259
        cb->named_group_list_size = newsize;
5721
259
        }
5722
5723
      /* Add this name to the list */
5724
13.7k
      if (is_dupname)
5725
11.2k
        hash |= NAMED_GROUP_IS_DUPNAME;
5726
5727
13.7k
      cb->named_groups[cb->names_found].name = name;
5728
13.7k
      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
5729
13.7k
      cb->named_groups[cb->names_found].number = cb->bracount;
5730
13.7k
      cb->named_groups[cb->names_found].hash_dup = hash;
5731
13.7k
      cb->names_found++;
5732
13.7k
      break;
5733
5734
5735
      /* ---- Perl extended character class ---- */
5736
5737
      /* These are of the form '(?[...])'. We handle these via the same parser
5738
      that consumes ordinary '[...]' classes, but with a flag set to activate
5739
      the extended behaviour. */
5740
5741
238
      case CHAR_LEFT_SQUARE_BRACKET:
5742
238
      class_mode_state = CLASS_MODE_PERL_EXT;
5743
238
      c = *ptr++;
5744
238
      goto FROM_PERL_EXTENDED_CLASS;
5745
166k
      }        /* End of (? switch */
5746
165k
    break;     /* End of ( handling */
5747
5748
5749
    /* ---- Branch terminators ---- */
5750
5751
    /* Alternation: reset the capture count if we are in a (?| group. */
5752
5753
800k
    case CHAR_VERTICAL_LINE:
5754
800k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
5755
800k
        (top_nest->flags & NSF_RESET) != 0)
5756
3.51k
      {
5757
3.51k
      if (cb->bracount > top_nest->max_group)
5758
356
        top_nest->max_group = (uint16_t)cb->bracount;
5759
3.51k
      cb->bracount = top_nest->reset_group;
5760
3.51k
      }
5761
800k
    *parsed_pattern++ = META_ALT;
5762
800k
    break;
5763
5764
    /* End of group; reset the capture count to the maximum if we are in a (?|
5765
    group and/or reset the options that are tracked during parsing. Disallow
5766
    quantifier for a condition that is an assertion. */
5767
5768
342k
    case CHAR_RIGHT_PARENTHESIS:
5769
342k
    okquantifier = TRUE;
5770
342k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
5771
21.3k
      {
5772
21.3k
      options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
5773
21.3k
      xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
5774
21.3k
      if ((top_nest->flags & NSF_RESET) != 0 &&
5775
21.3k
          top_nest->max_group > cb->bracount)
5776
253
        cb->bracount = top_nest->max_group;
5777
21.3k
      if ((top_nest->flags & NSF_CONDASSERT) != 0)
5778
11.8k
        okquantifier = FALSE;
5779
5780
21.3k
      if ((top_nest->flags & NSF_ATOMICSR) != 0)
5781
376
        {
5782
376
        *parsed_pattern++ = META_KET;
5783
5784
#ifdef PCRE2_DEBUG
5785
        PCRE2_ASSERT(parsed_pattern_extra > 0);
5786
        parsed_pattern_extra--;
5787
#endif
5788
376
        }
5789
5790
21.3k
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
5791
5.12k
        else top_nest--;
5792
21.3k
      }
5793
342k
    if (nest_depth == 0)    /* Unmatched closing parenthesis */
5794
360
      {
5795
360
      errorcode = ERR22;
5796
360
      goto FAILED_BACK;  // TODO https://github.com/PCRE2Project/pcre2/issues/549
5797
360
      }
5798
341k
    nest_depth--;
5799
341k
    *parsed_pattern++ = META_KET;
5800
341k
    break;
5801
15.8M
    }  /* End of switch on pattern character */
5802
15.8M
  }    /* End of main character scan loop */
5803
5804
/* End of pattern reached. Check for missing ) at the end of a verb name. */
5805
5806
57.6k
if (inverbname && ptr >= ptrend)
5807
104
  {
5808
104
  errorcode = ERR60;
5809
104
  goto FAILED;
5810
104
  }
5811
5812
5813
57.5k
PARSED_END:
5814
5815
57.5k
PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
5816
57.5k
             (parsed_pattern_extra - parsed_pattern_extra_check) <=
5817
57.5k
               max_parsed_pattern(ptr_check, ptr, utf, options));
5818
5819
/* Manage callout for the final item */
5820
5821
57.5k
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
5822
57.5k
  parsed_pattern, cb);
5823
5824
/* Insert trailing items for word and line matching (features provided for the
5825
benefit of pcre2grep). */
5826
5827
57.5k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
5828
0
  {
5829
0
  *parsed_pattern++ = META_KET;
5830
0
  *parsed_pattern++ = META_DOLLAR;
5831
0
  }
5832
57.5k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5833
0
  {
5834
0
  *parsed_pattern++ = META_KET;
5835
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
5836
0
  }
5837
5838
/* Terminate the parsed pattern, then return success if all groups are closed.
5839
Otherwise we have unclosed parentheses. */
5840
5841
57.5k
if (parsed_pattern >= parsed_pattern_end)
5842
0
  {
5843
0
  PCRE2_DEBUG_UNREACHABLE();
5844
0
  errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5845
0
  goto FAILED;
5846
0
  }
5847
5848
57.5k
*parsed_pattern = META_END;
5849
57.5k
if (nest_depth == 0) return 0;
5850
5851
1.13k
UNCLOSED_PARENTHESIS:
5852
1.13k
errorcode = ERR14;
5853
5854
/* Come here for all failures. */
5855
5856
5.37k
FAILED:
5857
5.37k
cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5858
5.37k
return errorcode;
5859
5860
/* Some errors need to indicate the previous character. */
5861
5862
608
FAILED_BACK:
5863
608
ptr--;
5864
608
goto FAILED;
5865
5866
/* This failure happens several times. */
5867
5868
28
BAD_VERSION_CONDITION:
5869
28
errorcode = ERR79;
5870
28
goto FAILED;
5871
1.13k
}
5872
5873
5874
5875
/*************************************************
5876
*       Find first significant opcode            *
5877
*************************************************/
5878
5879
/* This is called by several functions that scan a compiled expression looking
5880
for a fixed first character, or an anchoring opcode etc. It skips over things
5881
that do not influence this. For some calls, it makes sense to skip negative
5882
forward and all backward assertions, and also the \b assertion; for others it
5883
does not.
5884
5885
Arguments:
5886
  code         pointer to the start of the group
5887
  skipassert   TRUE if certain assertions are to be skipped
5888
5889
Returns:       pointer to the first significant opcode
5890
*/
5891
5892
static const PCRE2_UCHAR*
5893
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5894
137k
{
5895
137k
for (;;)
5896
163k
  {
5897
163k
  switch ((int)*code)
5898
163k
    {
5899
866
    case OP_ASSERT_NOT:
5900
1.46k
    case OP_ASSERTBACK:
5901
2.95k
    case OP_ASSERTBACK_NOT:
5902
4.10k
    case OP_ASSERTBACK_NA:
5903
4.10k
    if (!skipassert) return code;
5904
8.43k
    do code += GET(code, 1); while (*code == OP_ALT);
5905
2.98k
    code += PRIV(OP_lengths)[*code];
5906
2.98k
    break;
5907
5908
908
    case OP_WORD_BOUNDARY:
5909
1.25k
    case OP_NOT_WORD_BOUNDARY:
5910
1.39k
    case OP_UCP_WORD_BOUNDARY:
5911
1.57k
    case OP_NOT_UCP_WORD_BOUNDARY:
5912
1.57k
    if (!skipassert) return code;
5913
    /* Fall through */
5914
5915
21.8k
    case OP_CALLOUT:
5916
21.8k
    case OP_CREF:
5917
21.8k
    case OP_DNCREF:
5918
21.8k
    case OP_RREF:
5919
21.8k
    case OP_DNRREF:
5920
21.8k
    case OP_FALSE:
5921
21.8k
    case OP_TRUE:
5922
21.8k
    code += PRIV(OP_lengths)[*code];
5923
21.8k
    break;
5924
5925
325
    case OP_CALLOUT_STR:
5926
325
    code += GET(code, 1 + 2*LINK_SIZE);
5927
325
    break;
5928
5929
154
    case OP_SKIPZERO:
5930
154
    code += 2 + GET(code, 2) + LINK_SIZE;
5931
154
    break;
5932
5933
691
    case OP_COND:
5934
724
    case OP_SCOND:
5935
724
    if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
5936
724
        code[GET(code, 1)] != OP_KET)      /* More than one branch */
5937
698
      return code;
5938
26
    code += GET(code, 1) + 1 + LINK_SIZE;
5939
26
    break;
5940
5941
130
    case OP_MARK:
5942
181
    case OP_COMMIT_ARG:
5943
216
    case OP_PRUNE_ARG:
5944
1.12k
    case OP_SKIP_ARG:
5945
1.16k
    case OP_THEN_ARG:
5946
1.16k
    code += code[1] + PRIV(OP_lengths)[*code];
5947
1.16k
    break;
5948
5949
134k
    default:
5950
134k
    return code;
5951
163k
    }
5952
163k
  }
5953
5954
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
5955
0
}
5956
5957
5958
5959
/*************************************************
5960
*           Compile one branch                   *
5961
*************************************************/
5962
5963
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
5964
the options are changed during the branch, the pointer is used to change the
5965
external options bits. This function is used during the pre-compile phase when
5966
we are trying to find out the amount of memory needed, as well as during the
5967
real compile phase. The value of lengthptr distinguishes the two phases.
5968
5969
Arguments:
5970
  optionsptr        pointer to the option bits
5971
  xoptionsptr       pointer to the extra option bits
5972
  codeptr           points to the pointer to the current code point
5973
  pptrptr           points to the current parsed pattern pointer
5974
  errorcodeptr      points to error code variable
5975
  firstcuptr        place to put the first required code unit
5976
  firstcuflagsptr   place to put the first code unit flags
5977
  reqcuptr          place to put the last required code unit
5978
  reqcuflagsptr     place to put the last required code unit flags
5979
  bcptr             points to current branch chain
5980
  open_caps         points to current capitem
5981
  cb                contains pointers to tables etc.
5982
  lengthptr         NULL during the real compile phase
5983
                    points to length accumulator during pre-compile phase
5984
5985
Returns:            0 There's been an error, *errorcodeptr is non-zero
5986
                   +1 Success, this branch must match at least one character
5987
                   -1 Success, this branch may match an empty string
5988
*/
5989
5990
static int
5991
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
5992
  PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
5993
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
5994
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
5995
  compile_block *cb, PCRE2_SIZE *lengthptr)
5996
2.09M
{
5997
2.09M
int bravalue = 0;
5998
2.09M
int okreturn = -1;
5999
2.09M
int group_return = 0;
6000
2.09M
uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
6001
2.09M
uint32_t greedy_default, greedy_non_default;
6002
2.09M
uint32_t repeat_type, op_type;
6003
2.09M
uint32_t options = *optionsptr;               /* May change dynamically */
6004
2.09M
uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
6005
2.09M
uint32_t firstcu, reqcu;
6006
2.09M
uint32_t zeroreqcu, zerofirstcu;
6007
2.09M
uint32_t *pptr = *pptrptr;
6008
2.09M
uint32_t meta, meta_arg;
6009
2.09M
uint32_t firstcuflags, reqcuflags;
6010
2.09M
uint32_t zeroreqcuflags, zerofirstcuflags;
6011
2.09M
uint32_t req_caseopt, reqvary, tempreqvary;
6012
/* Some opcodes, such as META_CAPTURE_NUMBER or META_CAPTURE_NAME,
6013
depend on the previous value of offset. */
6014
2.09M
PCRE2_SIZE offset = 0;
6015
2.09M
PCRE2_SIZE length_prevgroup = 0;
6016
2.09M
PCRE2_UCHAR *code = *codeptr;
6017
2.09M
PCRE2_UCHAR *last_code = code;
6018
2.09M
PCRE2_UCHAR *orig_code = code;
6019
2.09M
PCRE2_UCHAR *tempcode;
6020
2.09M
PCRE2_UCHAR *previous = NULL;
6021
2.09M
PCRE2_UCHAR op_previous;
6022
2.09M
BOOL groupsetfirstcu = FALSE;
6023
2.09M
BOOL had_accept = FALSE;
6024
2.09M
BOOL matched_char = FALSE;
6025
2.09M
BOOL previous_matched_char = FALSE;
6026
2.09M
BOOL reset_caseful = FALSE;
6027
6028
/* We can fish out the UTF setting once and for all into a BOOL, but we must
6029
not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
6030
as we process the pattern. */
6031
6032
2.09M
#ifdef SUPPORT_UNICODE
6033
2.09M
BOOL utf = (options & PCRE2_UTF) != 0;
6034
2.09M
BOOL ucp = (options & PCRE2_UCP) != 0;
6035
#else  /* No Unicode support */
6036
BOOL utf = FALSE;
6037
#endif
6038
6039
/* Set up the default and non-default settings for greediness */
6040
6041
2.09M
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6042
2.09M
greedy_non_default = greedy_default ^ 1;
6043
6044
/* Initialize no first unit, no required unit. REQ_UNSET means "no char
6045
matching encountered yet". It gets changed to REQ_NONE if we hit something that
6046
matches a non-fixed first unit; reqcu just remains unset if we never find one.
6047
6048
When we hit a repeat whose minimum is zero, we may have to adjust these values
6049
to take the zero repeat into account. This is implemented by setting them to
6050
zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
6051
item types that can be repeated set these backoff variables appropriately. */
6052
6053
2.09M
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
6054
2.09M
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
6055
6056
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
6057
according to the current setting of the caseless flag. The REQ_CASELESS value
6058
leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
6059
to record the case status of the value. This is used only for ASCII characters.
6060
*/
6061
6062
2.09M
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6063
6064
/* Switch on next META item until the end of the branch */
6065
6066
27.0M
for (;; pptr++)
6067
29.1M
  {
6068
29.1M
  BOOL possessive_quantifier;
6069
29.1M
  BOOL note_group_empty;
6070
29.1M
  uint32_t mclength;
6071
29.1M
  uint32_t skipunits;
6072
29.1M
  uint32_t subreqcu, subfirstcu;
6073
29.1M
  uint32_t groupnumber;
6074
29.1M
  uint32_t verbarglen, verbculen;
6075
29.1M
  uint32_t subreqcuflags, subfirstcuflags;
6076
29.1M
  open_capitem *oc;
6077
29.1M
  PCRE2_UCHAR mcbuffer[8];
6078
6079
  /* Get next META item in the pattern and its potential argument. */
6080
6081
29.1M
  meta = META_CODE(*pptr);
6082
29.1M
  meta_arg = META_DATA(*pptr);
6083
6084
  /* If we are in the pre-compile phase, accumulate the length used for the
6085
  previous cycle of this loop, unless the next item is a quantifier. */
6086
6087
29.1M
  if (lengthptr != NULL)
6088
14.7M
    {
6089
14.7M
    if (code > cb->start_workspace + cb->workspace_size -
6090
14.7M
        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
6091
0
      {
6092
0
      if (code >= cb->start_workspace + cb->workspace_size)
6093
0
        {
6094
0
        PCRE2_DEBUG_UNREACHABLE();
6095
0
        *errorcodeptr = ERR52;  /* Over-ran workspace - internal error */
6096
0
        }
6097
0
      else
6098
0
        *errorcodeptr = ERR86;
6099
0
      return 0;
6100
0
      }
6101
6102
    /* There is at least one situation where code goes backwards: this is the
6103
    case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
6104
    is processed, the whole class is eliminated. However, it is created first,
6105
    so we have to allow memory for it. Therefore, don't ever reduce the length
6106
    at this point. */
6107
6108
14.7M
    if (code < last_code) code = last_code;
6109
6110
    /* If the next thing is not a quantifier, we add the length of the previous
6111
    item into the total, and reset the code pointer to the start of the
6112
    workspace. Otherwise leave the previous item available to be quantified. */
6113
6114
14.7M
    if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6115
13.7M
      {
6116
13.7M
      if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
6117
0
        {
6118
0
        *errorcodeptr = ERR20;   /* Integer overflow */
6119
0
        return 0;
6120
0
        }
6121
13.7M
      *lengthptr += (PCRE2_SIZE)(code - orig_code);
6122
13.7M
      if (*lengthptr > MAX_PATTERN_SIZE)
6123
18
        {
6124
18
        *errorcodeptr = ERR20;   /* Pattern is too large */
6125
18
        return 0;
6126
18
        }
6127
13.7M
      code = orig_code;
6128
13.7M
      }
6129
6130
    /* Remember where this code item starts so we can catch the "backwards"
6131
    case above next time round. */
6132
6133
14.7M
    last_code = code;
6134
14.7M
    }
6135
6136
  /* Process the next parsed pattern item. If it is not a quantifier, remember
6137
  where it starts so that it can be quantified when a quantifier follows.
6138
  Checking for the legality of quantifiers happens in parse_regex(), except for
6139
  a quantifier after an assertion that is a condition. */
6140
6141
29.1M
  if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6142
27.2M
    {
6143
27.2M
    previous = code;
6144
27.2M
    if (matched_char && !had_accept) okreturn = 1;
6145
27.2M
    }
6146
6147
29.1M
  previous_matched_char = matched_char;
6148
29.1M
  matched_char = FALSE;
6149
29.1M
  note_group_empty = FALSE;
6150
29.1M
  skipunits = 0;         /* Default value for most subgroups */
6151
6152
29.1M
  switch(meta)
6153
29.1M
    {
6154
    /* ===================================================================*/
6155
    /* The branch terminates at pattern end or | or ) */
6156
6157
109k
    case META_END:
6158
1.56M
    case META_ALT:
6159
2.09M
    case META_KET:
6160
2.09M
    *firstcuptr = firstcu;
6161
2.09M
    *firstcuflagsptr = firstcuflags;
6162
2.09M
    *reqcuptr = reqcu;
6163
2.09M
    *reqcuflagsptr = reqcuflags;
6164
2.09M
    *codeptr = code;
6165
2.09M
    *pptrptr = pptr;
6166
2.09M
    return okreturn;
6167
6168
6169
    /* ===================================================================*/
6170
    /* Handle single-character metacharacters. In multiline mode, ^ disables
6171
    the setting of any following char as a first character. */
6172
6173
349k
    case META_CIRCUMFLEX:
6174
349k
    if ((options & PCRE2_MULTILINE) != 0)
6175
87.2k
      {
6176
87.2k
      if (firstcuflags == REQ_UNSET)
6177
3.87k
        zerofirstcuflags = firstcuflags = REQ_NONE;
6178
87.2k
      *code++ = OP_CIRCM;
6179
87.2k
      }
6180
262k
    else *code++ = OP_CIRC;
6181
349k
    break;
6182
6183
423k
    case META_DOLLAR:
6184
423k
    *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
6185
423k
    break;
6186
6187
    /* There can never be a first char if '.' is first, whatever happens about
6188
    repeats. The value of reqcu doesn't change either. */
6189
6190
269k
    case META_DOT:
6191
269k
    matched_char = TRUE;
6192
269k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6193
269k
    zerofirstcu = firstcu;
6194
269k
    zerofirstcuflags = firstcuflags;
6195
269k
    zeroreqcu = reqcu;
6196
269k
    zeroreqcuflags = reqcuflags;
6197
269k
    *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
6198
269k
    break;
6199
6200
6201
    /* ===================================================================*/
6202
    /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
6203
    Otherwise, an initial ']' is taken as a data character. When empty classes
6204
    are allowed, [] must generate an empty class - we have no dedicated opcode
6205
    to optimise the representation, but it's a rare case (the '(*FAIL)'
6206
    construct would be a clearer way for a pattern author to represent a
6207
    non-matching branch, but it does have different semantics to '[]' if both
6208
    are followed by a quantifier). The empty-negated [^] matches any character,
6209
    so is useful: generate OP_ALLANY for this. */
6210
6211
12.9k
    case META_CLASS_EMPTY:
6212
16.8k
    case META_CLASS_EMPTY_NOT:
6213
16.8k
    matched_char = TRUE;
6214
16.8k
    if (meta == META_CLASS_EMPTY_NOT) *code++ = OP_ALLANY;
6215
12.9k
    else
6216
12.9k
      {
6217
12.9k
      *code++ = OP_CLASS;
6218
12.9k
      memset(code, 0, 32);
6219
12.9k
      code += 32 / sizeof(PCRE2_UCHAR);
6220
12.9k
      }
6221
6222
16.8k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6223
16.8k
    zerofirstcu = firstcu;
6224
16.8k
    zerofirstcuflags = firstcuflags;
6225
16.8k
    break;
6226
6227
6228
    /* ===================================================================*/
6229
    /* Non-empty character class. If the included characters are all < 256, we
6230
    build a 32-byte bitmap of the permitted characters, except in the special
6231
    case where there is only one such character. For negated classes, we build
6232
    the map as usual, then invert it at the end. However, we use a different
6233
    opcode so that data characters > 255 can be handled correctly.
6234
6235
    If the class contains characters outside the 0-255 range, a different
6236
    opcode is compiled. It may optionally have a bit map for characters < 256,
6237
    but those above are explicitly listed afterwards. A flag code unit tells
6238
    whether the bitmap is present, and whether this is a negated class or
6239
    not. */
6240
6241
388k
    case META_CLASS_NOT:
6242
838k
    case META_CLASS:
6243
838k
    matched_char = TRUE;
6244
6245
    /* Check for complex extended classes and handle them separately. */
6246
6247
838k
    if ((*pptr & CLASS_IS_ECLASS) != 0)
6248
52.5k
      {
6249
52.5k
      if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,
6250
52.5k
                                      errorcodeptr, cb, lengthptr))
6251
0
        return 0;
6252
52.5k
      goto CLASS_END_PROCESSING;
6253
52.5k
      }
6254
6255
    /* We can optimize the case of a single character in a class by generating
6256
    OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
6257
    negative. In the negative case there can be no first char if this item is
6258
    first, whatever repeat count may follow. In the case of reqcu, save the
6259
    previous value for reinstating. */
6260
6261
    /* NOTE: at present this optimization is not effective if the only
6262
    character in a class in 32-bit, non-UCP mode has its top bit set. */
6263
6264
785k
    if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
6265
199k
      {
6266
199k
      uint32_t c = pptr[1];
6267
6268
199k
      pptr += 2;                 /* Move on to class end */
6269
199k
      if (meta == META_CLASS)    /* A positive one-char class can be */
6270
33.1k
        {                        /* handled as a normal literal character. */
6271
33.1k
        meta = c;                /* Set up the character */
6272
33.1k
        goto NORMAL_CHAR_SET;
6273
33.1k
        }
6274
6275
      /* Handle a negative one-character class */
6276
6277
166k
      zeroreqcu = reqcu;
6278
166k
      zeroreqcuflags = reqcuflags;
6279
166k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6280
166k
      zerofirstcu = firstcu;
6281
166k
      zerofirstcuflags = firstcuflags;
6282
6283
      /* For caseless UTF or UCP mode, check whether this character has more
6284
      than one other case. If so, generate a special OP_NOTPROP item instead of
6285
      OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
6286
      caseless set that starts with an ASCII character. If the character is
6287
      affected by the special Turkish rules, hardcode the not-matching
6288
      characters using a caseset. */
6289
6290
166k
#ifdef SUPPORT_UNICODE
6291
166k
      if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
6292
52.4k
        {
6293
52.4k
        uint32_t caseset;
6294
6295
52.4k
        if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6296
52.4k
              PCRE2_EXTRA_TURKISH_CASING &&
6297
52.4k
            UCD_ANY_I(c))
6298
0
          {
6299
0
          caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
6300
0
          }
6301
52.4k
        else if ((caseset = UCD_CASESET(c)) != 0 &&
6302
52.4k
                 (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6303
52.4k
                 PRIV(ucd_caseless_sets)[caseset] < 128)
6304
736
          {
6305
736
          caseset = 0;  /* Ignore the caseless set if it's restricted. */
6306
736
          }
6307
6308
52.4k
        if (caseset != 0)
6309
1.80k
          {
6310
1.80k
          *code++ = OP_NOTPROP;
6311
1.80k
          *code++ = PT_CLIST;
6312
1.80k
          *code++ = caseset;
6313
1.80k
          break;   /* We are finished with this class */
6314
1.80k
          }
6315
52.4k
        }
6316
164k
#endif
6317
      /* Char has only one other (usable) case, or UCP not available */
6318
6319
164k
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
6320
164k
      code += PUTCHAR(c, code);
6321
164k
      break;   /* We are finished with this class */
6322
166k
      }        /* End of 1-char optimization */
6323
6324
    /* Handle character classes that contain more than just one literal
6325
    character. If there are exactly two characters in a positive class, see if
6326
    they are case partners. This can be optimized to generate a caseless single
6327
    character match (which also sets first/required code units if relevant).
6328
    When casing restrictions apply, ignore a caseless set if both characters
6329
    are ASCII. When Turkish casing applies, an 'i' does not match its normal
6330
    Unicode "othercase". */
6331
6332
586k
    if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
6333
586k
        pptr[3] == META_CLASS_END)
6334
53.5k
      {
6335
53.5k
      uint32_t c = pptr[1];
6336
6337
53.5k
#ifdef SUPPORT_UNICODE
6338
53.5k
      if ((UCD_CASESET(c) == 0 ||
6339
53.5k
           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6340
1.14k
            c < 128 && pptr[2] < 128)) &&
6341
53.5k
          !((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6342
52.4k
              PCRE2_EXTRA_TURKISH_CASING &&
6343
52.4k
            UCD_ANY_I(c)))
6344
52.4k
#endif
6345
52.4k
        {
6346
52.4k
        uint32_t d;
6347
6348
52.4k
#ifdef SUPPORT_UNICODE
6349
52.4k
        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
6350
51.3k
#endif
6351
51.3k
          {
6352
#if PCRE2_CODE_UNIT_WIDTH != 8
6353
          if (c > 255) d = c; else
6354
#endif
6355
51.3k
          d = TABLE_GET(c, cb->fcc, c);
6356
51.3k
          }
6357
6358
52.4k
        if (c != d && pptr[2] == d)
6359
705
          {
6360
705
          pptr += 3;                 /* Move on to class end */
6361
705
          meta = c;
6362
705
          if ((options & PCRE2_CASELESS) == 0)
6363
418
            {
6364
418
            reset_caseful = TRUE;
6365
418
            options |= PCRE2_CASELESS;
6366
418
            req_caseopt = REQ_CASELESS;
6367
418
            }
6368
705
          goto CLASS_CASELESS_CHAR;
6369
705
          }
6370
52.4k
        }
6371
53.5k
      }
6372
6373
    /* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */
6374
6375
585k
    pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,
6376
585k
                                          &code, meta == META_CLASS_NOT, NULL,
6377
585k
                                          errorcodeptr, cb, lengthptr);
6378
585k
    if (pptr == NULL) return 0;
6379
585k
    PCRE2_ASSERT(*pptr == META_CLASS_END);
6380
6381
637k
    CLASS_END_PROCESSING:
6382
6383
    /* If this class is the first thing in the branch, there can be no first
6384
    char setting, whatever the repeat count. Any reqcu setting must remain
6385
    unchanged after any kind of repeat. */
6386
6387
637k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6388
637k
    zerofirstcu = firstcu;
6389
637k
    zerofirstcuflags = firstcuflags;
6390
637k
    zeroreqcu = reqcu;
6391
637k
    zeroreqcuflags = reqcuflags;
6392
637k
    break;  /* End of class processing */
6393
6394
6395
    /* ===================================================================*/
6396
    /* Deal with (*VERB)s. */
6397
6398
    /* Check for open captures before ACCEPT and close those that are within
6399
    the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6400
    assertion. In the first pass, just accumulate the length required;
6401
    otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6402
    workspace overflow. Do not set firstcu after *ACCEPT. */
6403
6404
8.53k
    case META_ACCEPT:
6405
8.53k
    cb->had_accept = had_accept = TRUE;
6406
8.53k
    for (oc = open_caps;
6407
16.2k
         oc != NULL && oc->assert_depth >= cb->assert_depth;
6408
8.53k
         oc = oc->next)
6409
7.74k
      {
6410
7.74k
      if (lengthptr != NULL)
6411
3.89k
        {
6412
3.89k
        *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6413
3.89k
        }
6414
3.85k
      else
6415
3.85k
        {
6416
3.85k
        *code++ = OP_CLOSE;
6417
3.85k
        PUT2INC(code, 0, oc->number);
6418
3.85k
        }
6419
7.74k
      }
6420
8.53k
    *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6421
8.53k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6422
8.53k
    break;
6423
6424
5.19k
    case META_PRUNE:
6425
12.4k
    case META_SKIP:
6426
12.4k
    cb->had_pruneorskip = TRUE;
6427
    /* Fall through */
6428
16.1k
    case META_COMMIT:
6429
21.1k
    case META_FAIL:
6430
21.1k
    *code++ = verbops[(meta - META_MARK) >> 16];
6431
21.1k
    break;
6432
6433
11.3k
    case META_THEN:
6434
11.3k
    cb->external_flags |= PCRE2_HASTHEN;
6435
11.3k
    *code++ = OP_THEN;
6436
11.3k
    break;
6437
6438
    /* Handle verbs with arguments. Arguments can be very long, especially in
6439
    16- and 32-bit modes, and can overflow the workspace in the first pass.
6440
    However, the argument length is constrained to be small enough to fit in
6441
    one code unit. This check happens in parse_regex(). In the first pass,
6442
    instead of putting the argument into memory, we just update the length
6443
    counter and set up an empty argument. */
6444
6445
2.24k
    case META_THEN_ARG:
6446
2.24k
    cb->external_flags |= PCRE2_HASTHEN;
6447
2.24k
    goto VERB_ARG;
6448
6449
551
    case META_PRUNE_ARG:
6450
6.67k
    case META_SKIP_ARG:
6451
6.67k
    cb->had_pruneorskip = TRUE;
6452
    /* Fall through */
6453
9.43k
    case META_MARK:
6454
10.9k
    case META_COMMIT_ARG:
6455
13.2k
    VERB_ARG:
6456
13.2k
    *code++ = verbops[(meta - META_MARK) >> 16];
6457
    /* The length is in characters. */
6458
13.2k
    verbarglen = *(++pptr);
6459
13.2k
    verbculen = 0;
6460
13.2k
    tempcode = code++;
6461
130k
    for (int i = 0; i < (int)verbarglen; i++)
6462
117k
      {
6463
117k
      meta = *(++pptr);
6464
117k
#ifdef SUPPORT_UNICODE
6465
117k
      if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6466
105k
#endif
6467
105k
        {
6468
105k
        mclength = 1;
6469
105k
        mcbuffer[0] = meta;
6470
105k
        }
6471
117k
      if (lengthptr != NULL) *lengthptr += mclength; else
6472
55.9k
        {
6473
55.9k
        memcpy(code, mcbuffer, CU2BYTES(mclength));
6474
55.9k
        code += mclength;
6475
55.9k
        verbculen += mclength;
6476
55.9k
        }
6477
117k
      }
6478
6479
13.2k
    *tempcode = verbculen;   /* Fill in the code unit length */
6480
13.2k
    *code++ = 0;             /* Terminating zero */
6481
13.2k
    break;
6482
6483
6484
    /* ===================================================================*/
6485
    /* Handle options change. The new setting must be passed back for use in
6486
    subsequent branches. Reset the greedy defaults and the case value for
6487
    firstcu and reqcu. */
6488
6489
4.54k
    case META_OPTIONS:
6490
4.54k
    *optionsptr = options = *(++pptr);
6491
4.54k
    *xoptionsptr = xoptions = *(++pptr);
6492
4.54k
    greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6493
4.54k
    greedy_non_default = greedy_default ^ 1;
6494
4.54k
    req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6495
4.54k
    break;
6496
6497
    /* ===================================================================*/
6498
    /* Handle scan substring. Scan substring assertion starts with META_SCS,
6499
    which recursively calls compile_branch. The first opcode processed by
6500
    this recursive call is always META_OFFSET. */
6501
6502
0
    case META_OFFSET:
6503
0
    if (lengthptr != NULL)
6504
0
      {
6505
0
      pptr = PRIV(compile_parse_scan_substr_args)(pptr, errorcodeptr, cb, lengthptr);
6506
0
      if (pptr == NULL)
6507
0
        return 0;
6508
0
      break;
6509
0
      }
6510
6511
0
    while (TRUE)
6512
0
      {
6513
0
      int count, index;
6514
0
      named_group *ng;
6515
6516
0
      switch (META_CODE(*pptr))
6517
0
        {
6518
0
        case META_OFFSET:
6519
0
        pptr++;
6520
0
        SKIPOFFSET(pptr);
6521
0
        continue;
6522
6523
0
        case META_CAPTURE_NAME:
6524
0
        ng = cb->named_groups + pptr[1];
6525
0
        pptr += 2;
6526
0
        count = 0;
6527
0
        index = 0;
6528
6529
0
        if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6530
0
          &count, errorcodeptr, cb)) return 0;
6531
6532
0
        code[0] = OP_DNCREF;
6533
0
        PUT2(code, 1, index);
6534
0
        PUT2(code, 1 + IMM2_SIZE, count);
6535
0
        code += 1 + 2 * IMM2_SIZE;
6536
0
        continue;
6537
6538
0
        case META_CAPTURE_NUMBER:
6539
0
        pptr += 2;
6540
0
        if (pptr[-1] == 0) continue;
6541
6542
0
        code[0] = OP_CREF;
6543
0
        PUT2(code, 1, pptr[-1]);
6544
0
        code += 1 + IMM2_SIZE;
6545
0
        continue;
6546
6547
0
        default:
6548
0
        break;
6549
0
        }
6550
6551
0
      break;
6552
0
      }
6553
0
    --pptr;
6554
0
    break;
6555
6556
0
    case META_SCS:
6557
0
    bravalue = OP_ASSERT_SCS;
6558
0
    cb->assert_depth += 1;
6559
0
    goto GROUP_PROCESS;
6560
6561
6562
    /* ===================================================================*/
6563
    /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6564
    because it could be a numerical check on recursion, or a name check on a
6565
    group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6566
    we can handle it either way. We first try for a name; if not found, process
6567
    the number. */
6568
6569
5.06k
    case META_COND_RNUMBER:   /* (?(Rdigits) */
6570
6.24k
    case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6571
8.00k
    case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6572
8.00k
    bravalue = OP_COND;
6573
6574
8.00k
    if (lengthptr != NULL)
6575
4.06k
      {
6576
4.06k
      uint32_t i;
6577
4.06k
      PCRE2_SPTR name;
6578
4.06k
      named_group *ng;
6579
4.06k
      uint32_t *start_pptr = pptr;
6580
4.06k
      uint32_t length = *(++pptr);
6581
6582
4.06k
      GETPLUSOFFSET(offset, pptr);
6583
4.06k
      name = cb->start_pattern + offset;
6584
6585
      /* In the first pass, the names generated in the pre-pass are available,
6586
      but the main name table has not yet been created. Scan the list of names
6587
      generated in the pre-pass in order to get a number and whether or not
6588
      this name is duplicated. If it is not duplicated, we can handle it as a
6589
      numerical group. */
6590
6591
4.06k
      ng = PRIV(compile_find_named_group)(name, length, cb);
6592
6593
4.06k
      if (ng == NULL)
6594
2.53k
        {
6595
        /* If the name was not found we have a bad reference, unless we are
6596
        dealing with R<digits>, which is treated as a recursion test by
6597
        number. */
6598
6599
2.53k
        groupnumber = 0;
6600
2.53k
        if (meta == META_COND_RNUMBER)
6601
2.48k
          {
6602
3.13k
          for (i = 1; i < length; i++)
6603
656
            {
6604
656
            groupnumber = groupnumber * 10 + (name[i] - CHAR_0);
6605
656
            if (groupnumber > MAX_GROUP_NUMBER)
6606
3
              {
6607
3
              *errorcodeptr = ERR61;
6608
3
              cb->erroroffset = offset + i;
6609
3
              return 0;
6610
3
              }
6611
656
            }
6612
2.48k
          }
6613
6614
2.52k
        if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6615
63
          {
6616
63
          *errorcodeptr = ERR15;
6617
63
          cb->erroroffset = offset;
6618
63
          return 0;
6619
63
          }
6620
6621
        /* (?(Rdigits) is treated as a recursion test by number. A value of
6622
        zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6623
        translated into RREF_ANY (which is 0xffff). */
6624
6625
2.46k
        if (groupnumber == 0) groupnumber = RREF_ANY;
6626
2.46k
        PCRE2_ASSERT(start_pptr[0] == META_COND_RNUMBER);
6627
2.46k
        start_pptr[1] = groupnumber;
6628
2.46k
        skipunits = 1+IMM2_SIZE;
6629
2.46k
        goto GROUP_PROCESS_NOTE_EMPTY;
6630
2.52k
        }
6631
6632
      /* From here on, we know we have a name (not a number),
6633
      so treat META_COND_RNUMBER the same as META_COND_NAME. */
6634
1.53k
      if (meta == META_COND_RNUMBER) meta = META_COND_NAME;
6635
6636
1.53k
      if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
6637
394
        {
6638
        /* Found a non-duplicated name. Since it is a global,
6639
        it is enough to update it in the pre-processing phase. */
6640
394
        if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6641
6642
394
        start_pptr[0] = meta;
6643
394
        start_pptr[1] = ng->number;
6644
6645
394
        skipunits = 1 + IMM2_SIZE;
6646
394
        goto GROUP_PROCESS_NOTE_EMPTY;
6647
394
        }
6648
6649
      /* We have a duplicated name. In the compile pass we have to search the
6650
      main table in order to get the index and count values. */
6651
6652
1.14k
      start_pptr[0] = meta | 1;
6653
1.14k
      start_pptr[1] = (uint32_t)(ng - cb->named_groups);
6654
6655
      /* A duplicated name was found. Note that if an R<digits> name is found
6656
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6657
1.14k
      skipunits = 1 + 2 * IMM2_SIZE;
6658
1.14k
      }
6659
3.93k
    else
6660
3.93k
      {
6661
      /* Otherwise lengthptr is NULL,
6662
      which is the second phase of compilation. */
6663
3.93k
      int count, index;
6664
3.93k
      named_group *ng;
6665
6666
      /* Generate code using the data
6667
      collected in the pre-processing phase. */
6668
6669
3.93k
      if (meta == META_COND_RNUMBER)
6670
2.45k
        {
6671
2.45k
        code[1+LINK_SIZE] = OP_RREF;
6672
2.45k
        PUT2(code, 2 + LINK_SIZE, pptr[1]);
6673
2.45k
        skipunits = 1 + IMM2_SIZE;
6674
2.45k
        pptr += 1 + SIZEOFFSET;
6675
2.45k
        goto GROUP_PROCESS_NOTE_EMPTY;
6676
2.45k
        }
6677
6678
1.48k
      if (meta_arg == 0)
6679
377
        {
6680
377
        code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6681
377
        PUT2(code, 2 + LINK_SIZE, pptr[1]);
6682
377
        skipunits = 1 + IMM2_SIZE;
6683
377
        pptr += 1 + SIZEOFFSET;
6684
377
        goto GROUP_PROCESS_NOTE_EMPTY;
6685
377
        }
6686
6687
1.10k
      ng = cb->named_groups + pptr[1];
6688
1.10k
      count = 0;  /* Values for first pass (avoids compiler warning) */
6689
1.10k
      index = 0;
6690
6691
      /* The failed case is an internal error. */
6692
1.10k
      if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6693
1.10k
            &count, errorcodeptr, cb)) return 0;
6694
6695
      /* A duplicated name was found. Note that if an R<digits> name is found
6696
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6697
6698
1.10k
      code[1 + LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;
6699
6700
      /* Insert appropriate data values. */
6701
1.10k
      PUT2(code, 2 + LINK_SIZE, index);
6702
1.10k
      PUT2(code, 2 + LINK_SIZE + IMM2_SIZE, count);
6703
1.10k
      skipunits = 1 + 2 * IMM2_SIZE;
6704
1.10k
      pptr += 1 + SIZEOFFSET;
6705
1.10k
      }
6706
6707
2.24k
    PCRE2_ASSERT(meta != META_CAPTURE_NAME);
6708
2.24k
    goto GROUP_PROCESS_NOTE_EMPTY;
6709
6710
    /* The DEFINE condition is always false. Its internal groups may never
6711
    be called, so matched_char must remain false, hence the jump to
6712
    GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6713
6714
0
    case META_COND_DEFINE:
6715
0
    bravalue = OP_COND;
6716
0
    GETPLUSOFFSET(offset, pptr);
6717
0
    code[1+LINK_SIZE] = OP_DEFINE;
6718
0
    skipunits = 1;
6719
0
    goto GROUP_PROCESS;
6720
6721
    /* Conditional test of a group's being set. */
6722
6723
552
    case META_COND_NUMBER:
6724
552
    bravalue = OP_COND;
6725
552
    GETPLUSOFFSET(offset, pptr);
6726
6727
552
    groupnumber = *(++pptr);
6728
552
    if (groupnumber > cb->bracount)
6729
10
      {
6730
10
      *errorcodeptr = ERR15;
6731
10
      cb->erroroffset = offset;
6732
10
      return 0;
6733
10
      }
6734
542
    if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6735
6736
    /* Point at initial ( for too many branches error */
6737
542
    offset -= 2;
6738
542
    code[1+LINK_SIZE] = OP_CREF;
6739
542
    skipunits = 1+IMM2_SIZE;
6740
542
    PUT2(code, 2+LINK_SIZE, groupnumber);
6741
542
    goto GROUP_PROCESS_NOTE_EMPTY;
6742
6743
    /* Test for the PCRE2 version. */
6744
6745
1.01k
    case META_COND_VERSION:
6746
1.01k
    bravalue = OP_COND;
6747
1.01k
    if (pptr[1] > 0)
6748
289
      code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6749
289
        (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6750
215
          OP_TRUE : OP_FALSE;
6751
723
    else
6752
723
      code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6753
723
        OP_TRUE : OP_FALSE;
6754
1.01k
    skipunits = 1;
6755
1.01k
    pptr += 3;
6756
1.01k
    goto GROUP_PROCESS_NOTE_EMPTY;
6757
6758
    /* The condition is an assertion, possibly preceded by a callout. */
6759
6760
20.1k
    case META_COND_ASSERT:
6761
20.1k
    bravalue = OP_COND;
6762
20.1k
    goto GROUP_PROCESS_NOTE_EMPTY;
6763
6764
6765
    /* ===================================================================*/
6766
    /* Handle all kinds of nested bracketed groups. The non-capturing,
6767
    non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6768
6769
29.1k
    case META_LOOKAHEAD:
6770
29.1k
    bravalue = OP_ASSERT;
6771
29.1k
    cb->assert_depth += 1;
6772
29.1k
    goto GROUP_PROCESS;
6773
6774
13.2k
    case META_LOOKAHEAD_NA:
6775
13.2k
    bravalue = OP_ASSERT_NA;
6776
13.2k
    cb->assert_depth += 1;
6777
13.2k
    goto GROUP_PROCESS;
6778
6779
    /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6780
    thing to do, but Perl allows all assertions to be quantified, and when
6781
    they contain capturing parentheses there may be a potential use for
6782
    this feature. Not that that applies to a quantified (?!) but we allow
6783
    it for uniformity. */
6784
6785
16.3k
    case META_LOOKAHEADNOT:
6786
16.3k
    if (pptr[1] == META_KET &&
6787
16.3k
         (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6788
941
      {
6789
941
      *code++ = OP_FAIL;
6790
941
      pptr++;
6791
941
      }
6792
15.3k
    else
6793
15.3k
      {
6794
15.3k
      bravalue = OP_ASSERT_NOT;
6795
15.3k
      cb->assert_depth += 1;
6796
15.3k
      goto GROUP_PROCESS;
6797
15.3k
      }
6798
941
    break;
6799
6800
7.57k
    case META_LOOKBEHIND:
6801
7.57k
    bravalue = OP_ASSERTBACK;
6802
7.57k
    cb->assert_depth += 1;
6803
7.57k
    goto GROUP_PROCESS;
6804
6805
14.8k
    case META_LOOKBEHINDNOT:
6806
14.8k
    bravalue = OP_ASSERTBACK_NOT;
6807
14.8k
    cb->assert_depth += 1;
6808
14.8k
    goto GROUP_PROCESS;
6809
6810
5.85k
    case META_LOOKBEHIND_NA:
6811
5.85k
    bravalue = OP_ASSERTBACK_NA;
6812
5.85k
    cb->assert_depth += 1;
6813
5.85k
    goto GROUP_PROCESS;
6814
6815
7.01k
    case META_ATOMIC:
6816
7.01k
    bravalue = OP_ONCE;
6817
7.01k
    goto GROUP_PROCESS_NOTE_EMPTY;
6818
6819
3.18k
    case META_SCRIPT_RUN:
6820
3.18k
    bravalue = OP_SCRIPT_RUN;
6821
3.18k
    goto GROUP_PROCESS_NOTE_EMPTY;
6822
6823
94.3k
    case META_NOCAPTURE:
6824
94.3k
    bravalue = OP_BRA;
6825
    /* Fall through */
6826
6827
    /* Process nested bracketed regex. The nesting depth is maintained for the
6828
    benefit of the stackguard function. The test for too deep nesting is now
6829
    done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6830
    others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6831
    note of whether or not they may match an empty string. */
6832
6833
446k
    GROUP_PROCESS_NOTE_EMPTY:
6834
446k
    note_group_empty = TRUE;
6835
6836
532k
    GROUP_PROCESS:
6837
532k
    cb->parens_depth += 1;
6838
532k
    *code = bravalue;
6839
532k
    pptr++;
6840
532k
    tempcode = code;
6841
532k
    tempreqvary = cb->req_varyopt;        /* Save value before group */
6842
532k
    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6843
6844
532k
    if ((group_return =
6845
532k
         compile_regex(
6846
532k
         options,                         /* The options state */
6847
532k
         xoptions,                        /* The extra options state */
6848
532k
         &tempcode,                       /* Where to put code (updated) */
6849
532k
         &pptr,                           /* Input pointer (updated) */
6850
532k
         errorcodeptr,                    /* Where to put an error message */
6851
532k
         skipunits,                       /* Skip over bracket number */
6852
532k
         &subfirstcu,                     /* For possible first char */
6853
532k
         &subfirstcuflags,
6854
532k
         &subreqcu,                       /* For possible last char */
6855
532k
         &subreqcuflags,
6856
532k
         bcptr,                           /* Current branch chain */
6857
532k
         open_caps,                       /* Pointer to capture stack */
6858
532k
         cb,                              /* Compile data block */
6859
532k
         (lengthptr == NULL)? NULL :      /* Actual compile phase */
6860
532k
           &length_prevgroup              /* Pre-compile phase */
6861
532k
         )) == 0)
6862
1.17k
      return 0;  /* Error */
6863
6864
531k
    cb->parens_depth -= 1;
6865
6866
    /* If that was a non-conditional significant group (not an assertion, not a
6867
    DEFINE) that matches at least one character, then the current item matches
6868
    a character. Conditionals are handled below. */
6869
6870
531k
    if (note_group_empty && bravalue != OP_COND && group_return > 0)
6871
147k
      matched_char = TRUE;
6872
6873
    /* If we've just compiled an assertion, pop the assert depth. */
6874
6875
531k
    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)
6876
85.8k
      cb->assert_depth -= 1;
6877
6878
    /* At the end of compiling, code is still pointing to the start of the
6879
    group, while tempcode has been updated to point past the end of the group.
6880
    The parsed pattern pointer (pptr) is on the closing META_KET.
6881
6882
    If this is a conditional bracket, check that there are no more than
6883
    two branches in the group, or just one if it's a DEFINE group. We do this
6884
    in the real compile phase, not in the pre-pass, where the whole group may
6885
    not be available. */
6886
6887
531k
    if (bravalue == OP_COND && lengthptr == NULL)
6888
14.7k
      {
6889
14.7k
      PCRE2_UCHAR *tc = code;
6890
14.7k
      int condcount = 0;
6891
6892
17.6k
      do {
6893
17.6k
         condcount++;
6894
17.6k
         tc += GET(tc,1);
6895
17.6k
         }
6896
17.6k
      while (*tc != OP_KET);
6897
6898
      /* A DEFINE group is never obeyed inline (the "condition" is always
6899
      false). It must have only one branch. Having checked this, change the
6900
      opcode to OP_FALSE. */
6901
6902
14.7k
      if (code[LINK_SIZE+1] == OP_DEFINE)
6903
0
        {
6904
0
        if (condcount > 1)
6905
0
          {
6906
0
          cb->erroroffset = offset;
6907
0
          *errorcodeptr = ERR54;
6908
0
          return 0;
6909
0
          }
6910
0
        code[LINK_SIZE+1] = OP_FALSE;
6911
0
        bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6912
0
        }
6913
6914
      /* A "normal" conditional group. If there is just one branch, we must not
6915
      make use of its firstcu or reqcu, because this is equivalent to an
6916
      empty second branch. Also, it may match an empty string. If there are two
6917
      branches, this item must match a character if the group must. */
6918
6919
14.7k
      else
6920
14.7k
        {
6921
14.7k
        if (condcount > 2)
6922
23
          {
6923
23
          cb->erroroffset = offset;
6924
23
          *errorcodeptr = ERR27;
6925
23
          return 0;
6926
23
          }
6927
14.6k
        if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
6928
2.54k
          else if (group_return > 0) matched_char = TRUE;
6929
14.6k
        }
6930
14.7k
      }
6931
6932
    /* In the pre-compile phase, update the length by the length of the group,
6933
    less the brackets at either end. Then reduce the compiled code to just a
6934
    set of non-capturing brackets so that it doesn't use much memory if it is
6935
    duplicated by a quantifier. */
6936
6937
530k
    if (lengthptr != NULL)
6938
267k
      {
6939
267k
      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6940
0
        {
6941
0
        *errorcodeptr = ERR20;
6942
0
        return 0;
6943
0
        }
6944
267k
      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6945
267k
      code++;   /* This already contains bravalue */
6946
267k
      PUTINC(code, 0, 1 + LINK_SIZE);
6947
267k
      *code++ = OP_KET;
6948
267k
      PUTINC(code, 0, 1 + LINK_SIZE);
6949
267k
      break;    /* No need to waste time with special character handling */
6950
267k
      }
6951
6952
    /* Otherwise update the main code pointer to the end of the group. */
6953
6954
263k
    code = tempcode;
6955
6956
    /* For a DEFINE group, required and first character settings are not
6957
    relevant. */
6958
6959
263k
    if (bravalue == OP_DEFINE) break;
6960
6961
    /* Handle updating of the required and first code units for other types of
6962
    group. Update for normal brackets of all kinds, and conditions with two
6963
    branches (see code above). If the bracket is followed by a quantifier with
6964
    zero repeat, we have to back off. Hence the definition of zeroreqcu and
6965
    zerofirstcu outside the main loop so that they can be accessed for the back
6966
    off. */
6967
6968
263k
    zeroreqcu = reqcu;
6969
263k
    zeroreqcuflags = reqcuflags;
6970
263k
    zerofirstcu = firstcu;
6971
263k
    zerofirstcuflags = firstcuflags;
6972
263k
    groupsetfirstcu = FALSE;
6973
6974
263k
    if (bravalue >= OP_ONCE)  /* Not an assertion */
6975
220k
      {
6976
      /* If we have not yet set a firstcu in this branch, take it from the
6977
      subpattern, remembering that it was set here so that a repeat of more
6978
      than one can replicate it as reqcu if necessary. If the subpattern has
6979
      no firstcu, set "none" for the whole branch. In both cases, a zero
6980
      repeat forces firstcu to "none". */
6981
6982
220k
      if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
6983
44.5k
        {
6984
44.5k
        if (subfirstcuflags < REQ_NONE)
6985
10.2k
          {
6986
10.2k
          firstcu = subfirstcu;
6987
10.2k
          firstcuflags = subfirstcuflags;
6988
10.2k
          groupsetfirstcu = TRUE;
6989
10.2k
          }
6990
34.2k
        else firstcuflags = REQ_NONE;
6991
44.5k
        zerofirstcuflags = REQ_NONE;
6992
44.5k
        }
6993
6994
      /* If firstcu was previously set, convert the subpattern's firstcu
6995
      into reqcu if there wasn't one, using the vary flag that was in
6996
      existence beforehand. */
6997
6998
176k
      else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
6999
6.61k
        {
7000
6.61k
        subreqcu = subfirstcu;
7001
6.61k
        subreqcuflags = subfirstcuflags | tempreqvary;
7002
6.61k
        }
7003
7004
      /* If the subpattern set a required code unit (or set a first code unit
7005
      that isn't really the first code unit - see above), set it. */
7006
7007
220k
      if (subreqcuflags < REQ_NONE)
7008
59.3k
        {
7009
59.3k
        reqcu = subreqcu;
7010
59.3k
        reqcuflags = subreqcuflags;
7011
59.3k
        }
7012
220k
      }
7013
7014
    /* For a forward assertion, we take the reqcu, if set, provided that the
7015
    group has also set a firstcu. This can be helpful if the pattern that
7016
    follows the assertion doesn't set a different char. For example, it's
7017
    useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7018
    because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7019
    the "real" "a" would then become a reqcu instead of a firstcu. This is
7020
    overcome by a scan at the end if there's no firstcu, looking for an
7021
    asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7022
    we must only take the reqcu when the group also set a firstcu. Otherwise,
7023
    in that example, 'X' ends up set for both. */
7024
7025
42.5k
    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7026
42.5k
             subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7027
2.69k
      {
7028
2.69k
      reqcu = subreqcu;
7029
2.69k
      reqcuflags = subreqcuflags;
7030
2.69k
      }
7031
7032
263k
    break;  /* End of nested group handling */
7033
7034
7035
    /* ===================================================================*/
7036
    /* Handle named backreferences and recursions. */
7037
7038
22.4k
    case META_BACKREF_BYNAME:
7039
22.4k
    case META_RECURSE_BYNAME:
7040
22.4k
      {
7041
22.4k
      int count, index;
7042
22.4k
      PCRE2_SPTR name;
7043
22.4k
      named_group *ng;
7044
22.4k
      uint32_t length = *(++pptr);
7045
7046
22.4k
      GETPLUSOFFSET(offset, pptr);
7047
22.4k
      name = cb->start_pattern + offset;
7048
7049
      /* In the first pass, the names generated in the pre-pass are available,
7050
      but the main name table has not yet been created. Scan the list of names
7051
      generated in the pre-pass in order to get a number and whether or not
7052
      this name is duplicated. */
7053
7054
22.4k
      groupnumber = 0;
7055
22.4k
      ng = PRIV(compile_find_named_group)(name, length, cb);
7056
7057
22.4k
      if (ng == NULL)
7058
61
        {
7059
        /* If the name was not found we have a bad reference. */
7060
61
        *errorcodeptr = ERR15;
7061
61
        cb->erroroffset = offset;
7062
61
        return 0;
7063
61
        }
7064
7065
22.4k
      groupnumber = ng->number;
7066
7067
      /* For a recursion, that's all that is needed. We can now go to
7068
      the code that handles numerical recursion, applying it to the first
7069
      group with the given name. */
7070
7071
22.4k
      if (meta == META_RECURSE_BYNAME)
7072
46
        {
7073
46
        meta_arg = groupnumber;
7074
46
        goto HANDLE_NUMERICAL_RECURSION;
7075
46
        }
7076
7077
      /* For a back reference, update the back reference map and the
7078
      maximum back reference. */
7079
7080
22.3k
      cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7081
22.3k
      if (groupnumber > cb->top_backref)
7082
681
        cb->top_backref = groupnumber;
7083
7084
      /* If a back reference name is not duplicated, we can handle it as
7085
      a numerical reference. */
7086
7087
22.3k
      if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
7088
791
        {
7089
791
        meta_arg = groupnumber;
7090
791
        goto HANDLE_SINGLE_REFERENCE;
7091
791
        }
7092
7093
      /* If a back reference name is duplicated, we generate a different
7094
      opcode to a numerical back reference. In the second pass we must
7095
      search for the index and count in the final name table. */
7096
7097
21.6k
      count = 0;  /* Values for first pass (avoids compiler warning) */
7098
21.6k
      index = 0;
7099
21.6k
      if (lengthptr == NULL && !PRIV(compile_find_dupname_details)(name, length,
7100
10.5k
            &index, &count, errorcodeptr, cb)) return 0;
7101
7102
21.6k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7103
21.6k
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7104
21.6k
      PUT2INC(code, 0, index);
7105
21.6k
      PUT2INC(code, 0, count);
7106
21.6k
      if ((options & PCRE2_CASELESS) != 0)
7107
7.64k
        *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7108
7.15k
                   REFI_FLAG_CASELESS_RESTRICT : 0) |
7109
7.64k
                  (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7110
7.64k
                   REFI_FLAG_TURKISH_CASING : 0);
7111
21.6k
      }
7112
0
    break;
7113
7114
7115
    /* ===================================================================*/
7116
    /* Handle a numerical callout. */
7117
7118
2.78M
    case META_CALLOUT_NUMBER:
7119
2.78M
    code[0] = OP_CALLOUT;
7120
2.78M
    PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7121
2.78M
    PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7122
2.78M
    code[1 + 2*LINK_SIZE] = pptr[3];
7123
2.78M
    pptr += 3;
7124
2.78M
    code += PRIV(OP_lengths)[OP_CALLOUT];
7125
2.78M
    break;
7126
7127
7128
    /* ===================================================================*/
7129
    /* Handle a callout with a string argument. In the pre-pass we just compute
7130
    the length without generating anything. The length in pptr[3] includes both
7131
    delimiters; in the actual compile only the first one is copied, but a
7132
    terminating zero is added. Any doubled delimiters within the string make
7133
    this an overestimate, but it is not worth bothering about. */
7134
7135
10.8k
    case META_CALLOUT_STRING:
7136
10.8k
    if (lengthptr != NULL)
7137
5.45k
      {
7138
5.45k
      *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7139
5.45k
      pptr += 3;
7140
5.45k
      SKIPOFFSET(pptr);
7141
5.45k
      }
7142
7143
    /* In the real compile we can copy the string. The starting delimiter is
7144
     included so that the client can discover it if they want. We also pass the
7145
     start offset to help a script language give better error messages. */
7146
7147
5.42k
    else
7148
5.42k
      {
7149
5.42k
      PCRE2_SPTR pp;
7150
5.42k
      uint32_t delimiter;
7151
5.42k
      uint32_t length = pptr[3];
7152
5.42k
      PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7153
7154
5.42k
      code[0] = OP_CALLOUT_STR;
7155
5.42k
      PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7156
5.42k
      PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7157
7158
5.42k
      pptr += 3;
7159
5.42k
      GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7160
5.42k
      pp = cb->start_pattern + offset;
7161
5.42k
      delimiter = *callout_string++ = *pp++;
7162
5.42k
      if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7163
752
        delimiter = CHAR_RIGHT_CURLY_BRACKET;
7164
5.42k
      PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7165
7166
      /* The syntax of the pattern was checked in the parsing scan. The length
7167
      includes both delimiters, but we have passed the opening one just above,
7168
      so we reduce length before testing it. The test is for > 1 because we do
7169
      not want to copy the final delimiter. This also ensures that pp[1] is
7170
      accessible. */
7171
7172
19.7k
      while (--length > 1)
7173
14.3k
        {
7174
14.3k
        if (*pp == delimiter && pp[1] == delimiter)
7175
432
          {
7176
432
          *callout_string++ = delimiter;
7177
432
          pp += 2;
7178
432
          length--;
7179
432
          }
7180
13.9k
        else *callout_string++ = *pp++;
7181
14.3k
        }
7182
5.42k
      *callout_string++ = CHAR_NUL;
7183
7184
      /* Set the length of the entire item, then advance to its end. */
7185
7186
5.42k
      PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7187
5.42k
      code = callout_string;
7188
5.42k
      }
7189
10.8k
    break;
7190
7191
7192
    /* ===================================================================*/
7193
    /* Handle repetition. The different types are all sorted out in the parsing
7194
    pass. */
7195
7196
202k
    case META_MINMAX_PLUS:
7197
206k
    case META_MINMAX_QUERY:
7198
375k
    case META_MINMAX:
7199
375k
    repeat_min = *(++pptr);
7200
375k
    repeat_max = *(++pptr);
7201
375k
    goto REPEAT;
7202
7203
524k
    case META_ASTERISK:
7204
532k
    case META_ASTERISK_PLUS:
7205
535k
    case META_ASTERISK_QUERY:
7206
535k
    repeat_min = 0;
7207
535k
    repeat_max = REPEAT_UNLIMITED;
7208
535k
    goto REPEAT;
7209
7210
552k
    case META_PLUS:
7211
589k
    case META_PLUS_PLUS:
7212
600k
    case META_PLUS_QUERY:
7213
600k
    repeat_min = 1;
7214
600k
    repeat_max = REPEAT_UNLIMITED;
7215
600k
    goto REPEAT;
7216
7217
416k
    case META_QUERY:
7218
426k
    case META_QUERY_PLUS:
7219
433k
    case META_QUERY_QUERY:
7220
433k
    repeat_min = 0;
7221
433k
    repeat_max = 1;
7222
7223
1.94M
    REPEAT:
7224
1.94M
    if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7225
7226
    /* Remember whether this is a variable length repeat, and default to
7227
    single-char opcodes. */
7228
7229
1.94M
    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7230
7231
    /* Adjust first and required code units for a zero repeat. */
7232
7233
1.94M
    if (repeat_min == 0)
7234
1.10M
      {
7235
1.10M
      firstcu = zerofirstcu;
7236
1.10M
      firstcuflags = zerofirstcuflags;
7237
1.10M
      reqcu = zeroreqcu;
7238
1.10M
      reqcuflags = zeroreqcuflags;
7239
1.10M
      }
7240
7241
    /* Note the greediness and possessiveness. */
7242
7243
1.94M
    switch (meta)
7244
1.94M
      {
7245
202k
      case META_MINMAX_PLUS:
7246
209k
      case META_ASTERISK_PLUS:
7247
246k
      case META_PLUS_PLUS:
7248
256k
      case META_QUERY_PLUS:
7249
256k
      repeat_type = 0;                  /* Force greedy */
7250
256k
      possessive_quantifier = TRUE;
7251
256k
      break;
7252
7253
4.21k
      case META_MINMAX_QUERY:
7254
7.81k
      case META_ASTERISK_QUERY:
7255
19.0k
      case META_PLUS_QUERY:
7256
26.0k
      case META_QUERY_QUERY:
7257
26.0k
      repeat_type = greedy_non_default;
7258
26.0k
      possessive_quantifier = FALSE;
7259
26.0k
      break;
7260
7261
1.66M
      default:
7262
1.66M
      repeat_type = greedy_default;
7263
1.66M
      possessive_quantifier = FALSE;
7264
1.66M
      break;
7265
1.94M
      }
7266
7267
    /* Save start of previous item, in case we have to move it up in order to
7268
    insert something before it, and remember what it was. */
7269
7270
1.94M
    PCRE2_ASSERT(previous != NULL);
7271
1.94M
    tempcode = previous;
7272
1.94M
    op_previous = *previous;
7273
7274
    /* Now handle repetition for the different types of item. If the repeat
7275
    minimum and the repeat maximum are both 1, we can ignore the quantifier for
7276
    non-parenthesized items, as they have only one alternative. For anything in
7277
    parentheses, we must not ignore if {1} is possessive. */
7278
7279
1.94M
    switch (op_previous)
7280
1.94M
      {
7281
      /* If previous was a character or negated character match, abolish the
7282
      item and generate a repeat item instead. If a char item has a minimum of
7283
      more than one, ensure that it is set in reqcu - it might not be if a
7284
      sequence such as x{3} is the first thing in a branch because the x will
7285
      have gone into firstcu instead.  */
7286
7287
713k
      case OP_CHAR:
7288
1.01M
      case OP_CHARI:
7289
1.06M
      case OP_NOT:
7290
1.10M
      case OP_NOTI:
7291
1.10M
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7292
1.10M
      op_type = chartypeoffset[op_previous - OP_CHAR];
7293
7294
      /* Deal with UTF characters that take up more than one code unit. */
7295
7296
1.10M
#ifdef MAYBE_UTF_MULTI
7297
1.10M
      if (utf && NOT_FIRSTCU(code[-1]))
7298
38.8k
        {
7299
38.8k
        PCRE2_UCHAR *lastchar = code - 1;
7300
38.8k
        BACKCHAR(lastchar);
7301
38.8k
        mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7302
38.8k
        memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7303
38.8k
        }
7304
1.06M
      else
7305
1.06M
#endif  /* MAYBE_UTF_MULTI */
7306
7307
      /* Handle the case of a single code unit - either with no UTF support, or
7308
      with UTF disabled, or for a single-code-unit UTF character. In the latter
7309
      case, for a repeated positive match, get the caseless flag for the
7310
      required code unit from the previous character, because a class like [Aa]
7311
      sets a caseless A but by now the req_caseopt flag has been reset. */
7312
7313
1.06M
        {
7314
1.06M
        mcbuffer[0] = code[-1];
7315
1.06M
        mclength = 1;
7316
1.06M
        if (op_previous <= OP_CHARI && repeat_min > 1)
7317
39.3k
          {
7318
39.3k
          reqcu = mcbuffer[0];
7319
39.3k
          reqcuflags = cb->req_varyopt;
7320
39.3k
          if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7321
39.3k
          }
7322
1.06M
        }
7323
1.10M
      goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7324
7325
      /* If previous was a character class or a back reference, we put the
7326
      repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7327
7328
0
#ifdef SUPPORT_WIDE_CHARS
7329
50.3k
      case OP_XCLASS:
7330
53.7k
      case OP_ECLASS:
7331
53.7k
#endif
7332
202k
      case OP_CLASS:
7333
313k
      case OP_NCLASS:
7334
319k
      case OP_REF:
7335
322k
      case OP_REFI:
7336
326k
      case OP_DNREF:
7337
327k
      case OP_DNREFI:
7338
7339
327k
      if (repeat_max == 0)
7340
3.73k
        {
7341
3.73k
        code = previous;
7342
3.73k
        goto END_REPEAT;
7343
3.73k
        }
7344
324k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7345
7346
323k
      if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7347
82.2k
        *code++ = OP_CRSTAR + repeat_type;
7348
241k
      else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7349
63.0k
        *code++ = OP_CRPLUS + repeat_type;
7350
178k
      else if (repeat_min == 0 && repeat_max == 1)
7351
58.9k
        *code++ = OP_CRQUERY + repeat_type;
7352
119k
      else
7353
119k
        {
7354
119k
        *code++ = OP_CRRANGE + repeat_type;
7355
119k
        PUT2INC(code, 0, repeat_min);
7356
119k
        if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7357
119k
        PUT2INC(code, 0, repeat_max);
7358
119k
        }
7359
323k
      break;
7360
7361
      /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7362
      because pcre2_match() could not handle backtracking into recursively
7363
      called groups. Now that this backtracking is available, we no longer need
7364
      to do this. However, we still need to replicate recursions as we do for
7365
      groups so as to have independent backtracking points. We can replicate
7366
      for the minimum number of repeats directly. For optional repeats we now
7367
      wrap the recursion in OP_BRA brackets and make use of the bracket
7368
      repetition. */
7369
7370
6.02k
      case OP_RECURSE:
7371
6.02k
      if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7372
94
        goto END_REPEAT;
7373
7374
      /* Generate unwrapped repeats for a non-zero minimum, except when the
7375
      minimum is 1 and the maximum unlimited, because that can be handled with
7376
      OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7377
      minimum, we just need to generate the appropriate additional copies.
7378
      Otherwise we need to generate one more, to simulate the situation when
7379
      the minimum is zero. */
7380
7381
5.93k
      if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7382
1.96k
        {
7383
1.96k
        int replicate = repeat_min;
7384
7385
1.96k
        if (repeat_min == repeat_max) replicate--;
7386
7387
        /* In the pre-compile phase, we don't actually do the replication. We
7388
        just adjust the length as if we had. Do some paranoid checks for
7389
        potential integer overflow. */
7390
7391
1.96k
        if (lengthptr != NULL)
7392
985
          {
7393
985
          PCRE2_SIZE delta;
7394
985
          if (PRIV(ckd_smul)(&delta, replicate, (int)length_prevgroup) ||
7395
985
              OFLOW_MAX - *lengthptr < delta)
7396
0
            {
7397
0
            *errorcodeptr = ERR20;
7398
0
            return 0;
7399
0
            }
7400
985
          *lengthptr += delta;
7401
985
          }
7402
4.02k
        else for (int i = 0; i < replicate; i++)
7403
3.05k
          {
7404
3.05k
          memcpy(code, previous, CU2BYTES(length_prevgroup));
7405
3.05k
          previous = code;
7406
3.05k
          code += length_prevgroup;
7407
3.05k
          }
7408
7409
        /* If the number of repeats is fixed, we are done. Otherwise, adjust
7410
        the counts and fall through. */
7411
7412
1.96k
        if (repeat_min == repeat_max) break;
7413
318
        if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7414
318
        repeat_min = 0;
7415
318
        }
7416
7417
      /* Wrap the recursion call in OP_BRA brackets. */
7418
4.28k
        {
7419
4.28k
        PCRE2_SIZE length = (lengthptr != NULL) ? 1 + LINK_SIZE : length_prevgroup;
7420
7421
4.28k
        (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(length));
7422
4.28k
        op_previous = *previous = OP_BRA;
7423
4.28k
        PUT(previous, 1, 1 + LINK_SIZE + length);
7424
4.28k
        previous[1 + LINK_SIZE + length] = OP_KET;
7425
4.28k
        PUT(previous, 2 + LINK_SIZE + length, 1 + LINK_SIZE + length);
7426
4.28k
        }
7427
4.28k
      code += 2 + 2 * LINK_SIZE;
7428
4.28k
      length_prevgroup += 2 + 2 * LINK_SIZE;
7429
4.28k
      group_return = -1;  /* Set "may match empty string" */
7430
7431
      /* Now treat as a repeated OP_BRA. */
7432
      /* Fall through */
7433
7434
      /* If previous was a bracket group, we may have to replicate it in
7435
      certain cases. Note that at this point we can encounter only the "basic"
7436
      bracket opcodes such as BRA and CBRA, as this is the place where they get
7437
      converted into the more special varieties such as BRAPOS and SBRA.
7438
      Originally, PCRE did not allow repetition of assertions, but now it does,
7439
      for Perl compatibility. */
7440
7441
9.52k
      case OP_ASSERT:
7442
16.3k
      case OP_ASSERT_NOT:
7443
21.5k
      case OP_ASSERT_NA:
7444
22.7k
      case OP_ASSERTBACK:
7445
28.1k
      case OP_ASSERTBACK_NOT:
7446
29.0k
      case OP_ASSERTBACK_NA:
7447
29.0k
      case OP_ASSERT_SCS:
7448
32.6k
      case OP_ONCE:
7449
33.1k
      case OP_SCRIPT_RUN:
7450
50.8k
      case OP_BRA:
7451
102k
      case OP_CBRA:
7452
110k
      case OP_COND:
7453
110k
        {
7454
110k
        int len = (int)(code - previous);
7455
110k
        PCRE2_UCHAR *bralink = NULL;
7456
110k
        PCRE2_UCHAR *brazeroptr = NULL;
7457
7458
110k
        if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7459
475
          goto END_REPEAT;
7460
7461
        /* Repeating a DEFINE group (or any group where the condition is always
7462
        FALSE and there is only one branch) is pointless, but Perl allows the
7463
        syntax, so we just ignore the repeat. */
7464
7465
110k
        if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7466
110k
            previous[GET(previous, 1)] != OP_ALT)
7467
169
          goto END_REPEAT;
7468
7469
        /* Perl allows all assertions to be quantified, and when they contain
7470
        capturing parentheses and/or are optional there are potential uses for
7471
        this feature. PCRE2 used to force the maximum quantifier to 1 on the
7472
        invalid grounds that further repetition was never useful. This was
7473
        always a bit pointless, since an assertion could be wrapped with a
7474
        repeated group to achieve the effect. General repetition is now
7475
        permitted, but if the maximum is unlimited it is set to one more than
7476
        the minimum. */
7477
7478
109k
        if (op_previous < OP_ONCE)    /* Assertion */
7479
24.6k
          {
7480
24.6k
          if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7481
24.6k
          }
7482
7483
        /* The case of a zero minimum is special because of the need to stick
7484
        OP_BRAZERO in front of it, and because the group appears once in the
7485
        data, whereas in other cases it appears the minimum number of times. For
7486
        this reason, it is simplest to treat this case separately, as otherwise
7487
        the code gets far too messy. There are several special subcases when the
7488
        minimum is zero. */
7489
7490
109k
        if (repeat_min == 0)
7491
23.7k
          {
7492
          /* If the maximum is also zero, we used to just omit the group from
7493
          the output altogether, like this:
7494
7495
          ** if (repeat_max == 0)
7496
          **   {
7497
          **   code = previous;
7498
          **   goto END_REPEAT;
7499
          **   }
7500
7501
          However, that fails when a group or a subgroup within it is
7502
          referenced as a subroutine from elsewhere in the pattern, so now we
7503
          stick in OP_SKIPZERO in front of it so that it is skipped on
7504
          execution. As we don't have a list of which groups are referenced, we
7505
          cannot do this selectively.
7506
7507
          If the maximum is 1 or unlimited, we just have to stick in the
7508
          BRAZERO and do no more at this point. */
7509
7510
23.7k
          if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7511
19.8k
            {
7512
19.8k
            (void)memmove(previous + 1, previous, CU2BYTES(len));
7513
19.8k
            code++;
7514
19.8k
            if (repeat_max == 0)
7515
806
              {
7516
806
              *previous++ = OP_SKIPZERO;
7517
806
              goto END_REPEAT;
7518
806
              }
7519
19.0k
            brazeroptr = previous;    /* Save for possessive optimizing */
7520
19.0k
            *previous++ = OP_BRAZERO + repeat_type;
7521
19.0k
            }
7522
7523
          /* If the maximum is greater than 1 and limited, we have to replicate
7524
          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7525
          The first one has to be handled carefully because it's the original
7526
          copy, which has to be moved up. The remainder can be handled by code
7527
          that is common with the non-zero minimum case below. We have to
7528
          adjust the value of repeat_max, since one less copy is required. */
7529
7530
3.90k
          else
7531
3.90k
            {
7532
3.90k
            int linkoffset;
7533
3.90k
            (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7534
3.90k
            code += 2 + LINK_SIZE;
7535
3.90k
            *previous++ = OP_BRAZERO + repeat_type;
7536
3.90k
            *previous++ = OP_BRA;
7537
7538
            /* We chain together the bracket link offset fields that have to be
7539
            filled in later when the ends of the brackets are reached. */
7540
7541
3.90k
            linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7542
3.90k
            bralink = previous;
7543
3.90k
            PUTINC(previous, 0, linkoffset);
7544
3.90k
            }
7545
7546
22.9k
          if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7547
22.9k
          }
7548
7549
        /* If the minimum is greater than zero, replicate the group as many
7550
        times as necessary, and adjust the maximum to the number of subsequent
7551
        copies that we need. */
7552
7553
86.1k
        else
7554
86.1k
          {
7555
86.1k
          if (repeat_min > 1)
7556
40.6k
            {
7557
            /* In the pre-compile phase, we don't actually do the replication.
7558
            We just adjust the length as if we had. Do some paranoid checks for
7559
            potential integer overflow. */
7560
7561
40.6k
            if (lengthptr != NULL)
7562
20.6k
              {
7563
20.6k
              PCRE2_SIZE delta;
7564
20.6k
              if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7565
20.6k
                                 (int)length_prevgroup) ||
7566
20.6k
                  OFLOW_MAX - *lengthptr < delta)
7567
6
                {
7568
6
                *errorcodeptr = ERR20;
7569
6
                return 0;
7570
6
                }
7571
20.5k
              *lengthptr += delta;
7572
20.5k
              }
7573
7574
            /* This is compiling for real. If there is a set first code unit
7575
            for the group, and we have not yet set a "required code unit", set
7576
            it. */
7577
7578
20.0k
            else
7579
20.0k
              {
7580
20.0k
              if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7581
293
                {
7582
293
                reqcu = firstcu;
7583
293
                reqcuflags = firstcuflags;
7584
293
                }
7585
2.28M
              for (uint32_t i = 1; i < repeat_min; i++)
7586
2.26M
                {
7587
2.26M
                memcpy(code, previous, CU2BYTES(len));
7588
2.26M
                code += len;
7589
2.26M
                }
7590
20.0k
              }
7591
40.6k
            }
7592
7593
86.1k
          if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7594
86.1k
          }
7595
7596
        /* This code is common to both the zero and non-zero minimum cases. If
7597
        the maximum is limited, it replicates the group in a nested fashion,
7598
        remembering the bracket starts on a stack. In the case of a zero
7599
        minimum, the first one was set up above. In all cases the repeat_max
7600
        now specifies the number of additional copies needed. Again, we must
7601
        remember to replicate entries on the forward reference list. */
7602
7603
109k
        if (repeat_max != REPEAT_UNLIMITED)
7604
58.3k
          {
7605
          /* In the pre-compile phase, we don't actually do the replication. We
7606
          just adjust the length as if we had. For each repetition we must add
7607
          1 to the length for BRAZERO and for all but the last repetition we
7608
          must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7609
          paranoid checks to avoid integer overflow. */
7610
7611
58.3k
          if (lengthptr != NULL && repeat_max > 0)
7612
7.04k
            {
7613
7.04k
            PCRE2_SIZE delta;
7614
7.04k
            if (PRIV(ckd_smul)(&delta, repeat_max,
7615
7.04k
                               (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7616
7.04k
                OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7617
3
              {
7618
3
              *errorcodeptr = ERR20;
7619
3
              return 0;
7620
3
              }
7621
7.04k
            delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7622
7.04k
            *lengthptr += delta;
7623
7.04k
            }
7624
7625
          /* This is compiling for real */
7626
7627
710k
          else for (uint32_t i = repeat_max; i >= 1; i--)
7628
658k
            {
7629
658k
            *code++ = OP_BRAZERO + repeat_type;
7630
7631
            /* All but the final copy start a new nesting, maintaining the
7632
            chain of brackets outstanding. */
7633
7634
658k
            if (i != 1)
7635
652k
              {
7636
652k
              int linkoffset;
7637
652k
              *code++ = OP_BRA;
7638
652k
              linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7639
652k
              bralink = code;
7640
652k
              PUTINC(code, 0, linkoffset);
7641
652k
              }
7642
7643
658k
            memcpy(code, previous, CU2BYTES(len));
7644
658k
            code += len;
7645
658k
            }
7646
7647
          /* Now chain through the pending brackets, and fill in their length
7648
          fields (which are holding the chain links pro tem). */
7649
7650
714k
          while (bralink != NULL)
7651
655k
            {
7652
655k
            int oldlinkoffset;
7653
655k
            int linkoffset = (int)(code - bralink + 1);
7654
655k
            PCRE2_UCHAR *bra = code - linkoffset;
7655
655k
            oldlinkoffset = GET(bra, 1);
7656
655k
            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7657
655k
            *code++ = OP_KET;
7658
655k
            PUTINC(code, 0, linkoffset);
7659
655k
            PUT(bra, 1, linkoffset);
7660
655k
            }
7661
58.3k
          }
7662
7663
        /* If the maximum is unlimited, set a repeater in the final copy. For
7664
        SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7665
        possessively repeated ONCE brackets can be converted into non-capturing
7666
        brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7667
        saves having to deal with possessive ONCEs specially.
7668
7669
        Otherwise, when we are doing the actual compile phase, check to see
7670
        whether this group is one that could match an empty string. If so,
7671
        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7672
        that runtime checking can be done. [This check is also applied to ONCE
7673
        and SCRIPT_RUN groups at runtime, but in a different way.]
7674
7675
        Then, if the quantifier was possessive and the bracket is not a
7676
        conditional, we convert the BRA code to the POS form, and the KET code
7677
        to KETRPOS. (It turns out to be convenient at runtime to detect this
7678
        kind of subpattern at both the start and at the end.) The use of
7679
        special opcodes makes it possible to reduce greatly the stack usage in
7680
        pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7681
        OP_BRAPOSZERO.
7682
7683
        Then, if the minimum number of matches is 1 or 0, cancel the possessive
7684
        flag so that the default action below, of wrapping everything inside
7685
        atomic brackets, does not happen. When the minimum is greater than 1,
7686
        there will be earlier copies of the group, and so we still have to wrap
7687
        the whole thing. */
7688
7689
50.8k
        else
7690
50.8k
          {
7691
50.8k
          PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7692
50.8k
          PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7693
7694
          /* Convert possessive ONCE brackets to non-capturing */
7695
7696
50.8k
          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7697
7698
          /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7699
          to do is to set the KET. */
7700
7701
50.8k
          if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7702
2.27k
            *ketcode = OP_KETRMAX + repeat_type;
7703
7704
          /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7705
          (which have been converted to non-capturing above). */
7706
7707
48.5k
          else
7708
48.5k
            {
7709
            /* In the compile phase, adjust the opcode if the group can match
7710
            an empty string. For a conditional group with only one branch, the
7711
            value of group_return will not show "could be empty", so we must
7712
            check that separately. */
7713
7714
48.5k
            if (lengthptr == NULL)
7715
24.0k
              {
7716
24.0k
              if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7717
24.0k
              if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7718
654
                *bracode = OP_SCOND;
7719
24.0k
              }
7720
7721
            /* Handle possessive quantifiers. */
7722
7723
48.5k
            if (possessive_quantifier)
7724
27.6k
              {
7725
              /* For COND brackets, we wrap the whole thing in a possessively
7726
              repeated non-capturing bracket, because we have not invented POS
7727
              versions of the COND opcodes. */
7728
7729
27.6k
              if (*bracode == OP_COND || *bracode == OP_SCOND)
7730
3.56k
                {
7731
3.56k
                int nlen = (int)(code - bracode);
7732
3.56k
                (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7733
3.56k
                code += 1 + LINK_SIZE;
7734
3.56k
                nlen += 1 + LINK_SIZE;
7735
3.56k
                *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7736
3.56k
                *code++ = OP_KETRPOS;
7737
3.56k
                PUTINC(code, 0, nlen);
7738
3.56k
                PUT(bracode, 1, nlen);
7739
3.56k
                }
7740
7741
              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7742
7743
24.0k
              else
7744
24.0k
                {
7745
24.0k
                *bracode += 1;              /* Switch to xxxPOS opcodes */
7746
24.0k
                *ketcode = OP_KETRPOS;
7747
24.0k
                }
7748
7749
              /* If the minimum is zero, mark it as possessive, then unset the
7750
              possessive flag when the minimum is 0 or 1. */
7751
7752
27.6k
              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7753
27.6k
              if (repeat_min < 2) possessive_quantifier = FALSE;
7754
27.6k
              }
7755
7756
            /* Non-possessive quantifier */
7757
7758
20.9k
            else *ketcode = OP_KETRMAX + repeat_type;
7759
48.5k
            }
7760
50.8k
          }
7761
109k
        }
7762
109k
      break;
7763
7764
      /* If previous was a character type match (\d or similar), abolish it and
7765
      create a suitable repeat item. The code is shared with single-character
7766
      repeats by setting op_type to add a suitable offset into repeat_type.
7767
      Note that the Unicode property types will be present only when
7768
      SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7769
      here because it just makes it horribly messy. */
7770
7771
395k
      default:
7772
395k
      if (op_previous >= OP_EODN || op_previous <= OP_WORD_BOUNDARY)
7773
0
        {
7774
0
        PCRE2_DEBUG_UNREACHABLE();
7775
0
        *errorcodeptr = ERR10;  /* Not a character type - internal error */
7776
0
        return 0;
7777
0
        }
7778
395k
      else
7779
395k
        {
7780
395k
        int prop_type, prop_value;
7781
395k
        PCRE2_UCHAR *oldcode;
7782
7783
395k
        if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7784
7785
395k
        op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7786
395k
        mclength = 0;                         /* Not a character */
7787
7788
395k
        if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7789
68.7k
          {
7790
68.7k
          prop_type = previous[1];
7791
68.7k
          prop_value = previous[2];
7792
68.7k
          }
7793
326k
        else
7794
326k
          {
7795
          /* Come here from just above with a character in mcbuffer/mclength.
7796
          You must also set op_type before the jump. */
7797
1.43M
          OUTPUT_SINGLE_REPEAT:
7798
1.43M
          prop_type = prop_value = -1;
7799
1.43M
          }
7800
7801
        /* At this point, if prop_type == prop_value == -1 we either have a
7802
        character in mcbuffer when mclength is greater than zero, or we have
7803
        mclength zero, in which case there is a non-property character type in
7804
        op_previous. If prop_type/value are not negative, we have a property
7805
        character type in op_previous. */
7806
7807
1.50M
        oldcode = code;                   /* Save where we were */
7808
1.50M
        code = previous;                  /* Usually overwrite previous item */
7809
7810
        /* If the maximum is zero then the minimum must also be zero; Perl allows
7811
        this case, so we do too - by simply omitting the item altogether. */
7812
7813
1.50M
        if (repeat_max == 0) goto END_REPEAT;
7814
7815
        /* Combine the op_type with the repeat_type */
7816
7817
1.49M
        repeat_type += op_type;
7818
7819
        /* A minimum of zero is handled either as the special case * or ?, or as
7820
        an UPTO, with the maximum given. */
7821
7822
1.49M
        if (repeat_min == 0)
7823
892k
          {
7824
892k
          if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7825
448k
            else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7826
78.5k
          else
7827
78.5k
            {
7828
78.5k
            *code++ = OP_UPTO + repeat_type;
7829
78.5k
            PUT2INC(code, 0, repeat_max);
7830
78.5k
            }
7831
892k
          }
7832
7833
        /* A repeat minimum of 1 is optimized into some special cases. If the
7834
        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7835
        left in place and, if the maximum is greater than 1, we use OP_UPTO with
7836
        one less than the maximum. */
7837
7838
606k
        else if (repeat_min == 1)
7839
524k
          {
7840
524k
          if (repeat_max == REPEAT_UNLIMITED)
7841
494k
            *code++ = OP_PLUS + repeat_type;
7842
29.5k
          else
7843
29.5k
            {
7844
29.5k
            code = oldcode;  /* Leave previous item in place */
7845
29.5k
            if (repeat_max == 1) goto END_REPEAT;
7846
29.5k
            *code++ = OP_UPTO + repeat_type;
7847
29.5k
            PUT2INC(code, 0, repeat_max - 1);
7848
29.5k
            }
7849
524k
          }
7850
7851
        /* The case {n,n} is just an EXACT, while the general case {n,m} is
7852
        handled as an EXACT followed by an UPTO or STAR or QUERY. */
7853
7854
82.5k
        else
7855
82.5k
          {
7856
82.5k
          *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7857
82.5k
          PUT2INC(code, 0, repeat_min);
7858
7859
          /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7860
          and then generate the second opcode. For a repeated Unicode property
7861
          match, there are two extra values that define the required property,
7862
          and mclength is set zero to indicate this. */
7863
7864
82.5k
          if (repeat_max != repeat_min)
7865
29.5k
            {
7866
29.5k
            if (mclength > 0)
7867
22.1k
              {
7868
22.1k
              memcpy(code, mcbuffer, CU2BYTES(mclength));
7869
22.1k
              code += mclength;
7870
22.1k
              }
7871
7.38k
            else
7872
7.38k
              {
7873
7.38k
              *code++ = op_previous;
7874
7.38k
              if (prop_type >= 0)
7875
1.58k
                {
7876
1.58k
                *code++ = prop_type;
7877
1.58k
                *code++ = prop_value;
7878
1.58k
                }
7879
7.38k
              }
7880
7881
            /* Now set up the following opcode */
7882
7883
29.5k
            if (repeat_max == REPEAT_UNLIMITED)
7884
20.8k
              *code++ = OP_STAR + repeat_type;
7885
8.68k
            else
7886
8.68k
              {
7887
8.68k
              repeat_max -= repeat_min;
7888
8.68k
              if (repeat_max == 1)
7889
939
                {
7890
939
                *code++ = OP_QUERY + repeat_type;
7891
939
                }
7892
7.74k
              else
7893
7.74k
                {
7894
7.74k
                *code++ = OP_UPTO + repeat_type;
7895
7.74k
                PUT2INC(code, 0, repeat_max);
7896
7.74k
                }
7897
8.68k
              }
7898
29.5k
            }
7899
82.5k
          }
7900
7901
        /* Fill in the character or character type for the final opcode. */
7902
7903
1.49M
        if (mclength > 0)
7904
1.10M
          {
7905
1.10M
          memcpy(code, mcbuffer, CU2BYTES(mclength));
7906
1.10M
          code += mclength;
7907
1.10M
          }
7908
394k
        else
7909
394k
          {
7910
394k
          *code++ = op_previous;
7911
394k
          if (prop_type >= 0)
7912
68.4k
            {
7913
68.4k
            *code++ = prop_type;
7914
68.4k
            *code++ = prop_value;
7915
68.4k
            }
7916
394k
          }
7917
1.49M
        }
7918
1.49M
      break;
7919
1.94M
      }  /* End of switch on different op_previous values */
7920
7921
7922
    /* If the character following a repeat is '+', possessive_quantifier is
7923
    TRUE. For some opcodes, there are special alternative opcodes for this
7924
    case. For anything else, we wrap the entire repeated item inside OP_ONCE
7925
    brackets. Logically, the '+' notation is just syntactic sugar, taken from
7926
    Sun's Java package, but the special opcodes can optimize it.
7927
7928
    Some (but not all) possessively repeated subpatterns have already been
7929
    completely handled in the code just above. For them, possessive_quantifier
7930
    is always FALSE at this stage. Note that the repeated item starts at
7931
    tempcode, not at previous, which might be the first part of a string whose
7932
    (former) last char we repeated. */
7933
7934
1.93M
    if (possessive_quantifier)
7935
228k
      {
7936
228k
      int len;
7937
7938
      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
7939
      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
7940
      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
7941
      remains is greater than zero, there's a further opcode that can be
7942
      handled. If not, do nothing, leaving the EXACT alone. */
7943
7944
228k
      switch(*tempcode)
7945
228k
        {
7946
2.44k
        case OP_TYPEEXACT:
7947
2.44k
        tempcode += PRIV(OP_lengths)[*tempcode] +
7948
2.44k
          ((tempcode[1 + IMM2_SIZE] == OP_PROP
7949
2.44k
          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
7950
2.44k
        break;
7951
7952
        /* CHAR opcodes are used for exacts whose count is 1. */
7953
7954
3.78k
        case OP_CHAR:
7955
5.27k
        case OP_CHARI:
7956
15.4k
        case OP_NOT:
7957
21.7k
        case OP_NOTI:
7958
30.3k
        case OP_EXACT:
7959
32.9k
        case OP_EXACTI:
7960
40.1k
        case OP_NOTEXACT:
7961
47.2k
        case OP_NOTEXACTI:
7962
47.2k
        tempcode += PRIV(OP_lengths)[*tempcode];
7963
47.2k
#ifdef SUPPORT_UNICODE
7964
47.2k
        if (utf && HAS_EXTRALEN(tempcode[-1]))
7965
8.90k
          tempcode += GET_EXTRALEN(tempcode[-1]);
7966
47.2k
#endif
7967
47.2k
        break;
7968
7969
        /* For the class opcodes, the repeat operator appears at the end;
7970
        adjust tempcode to point to it. */
7971
7972
25.6k
        case OP_CLASS:
7973
86.5k
        case OP_NCLASS:
7974
86.5k
        tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
7975
86.5k
        break;
7976
7977
0
#ifdef SUPPORT_WIDE_CHARS
7978
16.5k
        case OP_XCLASS:
7979
16.7k
        case OP_ECLASS:
7980
16.7k
        tempcode += GET(tempcode, 1);
7981
16.7k
        break;
7982
228k
#endif
7983
228k
        }
7984
7985
      /* If tempcode is equal to code (which points to the end of the repeated
7986
      item), it means we have skipped an EXACT item but there is no following
7987
      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
7988
      all other cases, tempcode will be pointing to the repeat opcode, and will
7989
      be less than code, so the value of len will be greater than 0. */
7990
7991
228k
      len = (int)(code - tempcode);
7992
228k
      if (len > 0)
7993
217k
        {
7994
217k
        unsigned int repcode = *tempcode;
7995
7996
        /* There is a table for possessifying opcodes, all of which are less
7997
        than OP_CALLOUT. A zero entry means there is no possessified version.
7998
        */
7999
8000
217k
        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
8001
211k
          *tempcode = opcode_possessify[repcode];
8002
8003
        /* For opcodes without a special possessified version, wrap the item in
8004
        ONCE brackets. */
8005
8006
6.41k
        else
8007
6.41k
          {
8008
6.41k
          (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
8009
6.41k
          code += 1 + LINK_SIZE;
8010
6.41k
          len += 1 + LINK_SIZE;
8011
6.41k
          tempcode[0] = OP_ONCE;
8012
6.41k
          *code++ = OP_KET;
8013
6.41k
          PUTINC(code, 0, len);
8014
6.41k
          PUT(tempcode, 1, len);
8015
6.41k
          }
8016
217k
        }
8017
228k
      }
8018
8019
    /* We set the "follows varying string" flag for subsequently encountered
8020
    reqcus if it isn't already set and we have just passed a varying length
8021
    item. */
8022
8023
1.94M
    END_REPEAT:
8024
1.94M
    cb->req_varyopt |= reqvary;
8025
1.94M
    break;
8026
8027
8028
    /* ===================================================================*/
8029
    /* Handle a 32-bit data character with a value greater than META_END. */
8030
8031
0
    case META_BIGVALUE:
8032
0
    pptr++;
8033
0
    goto NORMAL_CHAR;
8034
8035
8036
    /* ===============================================================*/
8037
    /* Handle a back reference by number, which is the meta argument. The
8038
    pattern offsets for back references to group numbers less than 10 are held
8039
    in a special vector, to avoid using more than two parsed pattern elements
8040
    in 64-bit environments. We only need the offset to the first occurrence,
8041
    because if that doesn't fail, subsequent ones will also be OK. */
8042
8043
28.4k
    case META_BACKREF:
8044
28.4k
    if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8045
2.84k
      else GETPLUSOFFSET(offset, pptr);
8046
8047
28.4k
    if (meta_arg > cb->bracount)
8048
745
      {
8049
745
      cb->erroroffset = offset;
8050
745
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8051
745
      return 0;
8052
745
      }
8053
8054
    /* Come here from named backref handling when the reference is to a
8055
    single group (that is, not to a duplicated name). The back reference
8056
    data will have already been updated. We must disable firstcu if not
8057
    set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8058
    later. */
8059
8060
28.5k
    HANDLE_SINGLE_REFERENCE:
8061
28.5k
    if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8062
28.5k
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8063
28.5k
    PUT2INC(code, 0, meta_arg);
8064
28.5k
    if ((options & PCRE2_CASELESS) != 0)
8065
8.84k
      *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
8066
8.68k
                 REFI_FLAG_CASELESS_RESTRICT : 0) |
8067
8.84k
                (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
8068
8.84k
                 REFI_FLAG_TURKISH_CASING : 0);
8069
8070
    /* Update the map of back references, and keep the highest one. We
8071
    could do this in parse_regex() for numerical back references, but not
8072
    for named back references, because we don't know the numbers to which
8073
    named back references refer. So we do it all in this function. */
8074
8075
28.5k
    cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8076
28.5k
    if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8077
28.5k
    break;
8078
8079
8080
    /* ===============================================================*/
8081
    /* Handle recursion by inserting the number of the called group (which is
8082
    the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8083
    scanned and these numbers are replaced by offsets within the pattern. It is
8084
    done like this to avoid problems with forward references and adjusting
8085
    offsets when groups are duplicated and moved (as discovered in previous
8086
    implementations). Note that a recursion does not have a set first
8087
    character. */
8088
8089
75.3k
    case META_RECURSE:
8090
75.3k
    GETPLUSOFFSET(offset, pptr);
8091
75.3k
    if (meta_arg > cb->bracount)
8092
154
      {
8093
154
      cb->erroroffset = offset;
8094
154
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8095
154
      return 0;
8096
154
      }
8097
75.2k
    HANDLE_NUMERICAL_RECURSION:
8098
75.2k
    *code = OP_RECURSE;
8099
75.2k
    PUT(code, 1, meta_arg);
8100
75.2k
    code += 1 + LINK_SIZE;
8101
    /* Repeat processing requires this information to
8102
    determine the real length in pre-compile phase. */
8103
75.2k
    length_prevgroup = 1 + LINK_SIZE;
8104
8105
75.2k
    if (META_CODE(pptr[1]) == META_OFFSET ||
8106
75.2k
        META_CODE(pptr[1]) == META_CAPTURE_NAME ||
8107
75.2k
        META_CODE(pptr[1]) == META_CAPTURE_NUMBER)
8108
695
      {
8109
695
      recurse_arguments *args;
8110
8111
695
      if (lengthptr != NULL)
8112
369
        {
8113
369
        if (!PRIV(compile_parse_recurse_args)(pptr, offset, errorcodeptr, cb))
8114
39
          return 0;
8115
8116
330
        args = (recurse_arguments*)cb->last_data;
8117
330
        length_prevgroup += (args->size * (1 + IMM2_SIZE));
8118
330
        *lengthptr += (args->size * (1 + IMM2_SIZE));
8119
330
        pptr += args->skip_size;
8120
330
        }
8121
326
      else
8122
326
        {
8123
326
        uint16_t *current, *end;
8124
8125
326
        args = (recurse_arguments*)cb->first_data;
8126
326
        PCRE2_ASSERT(args != NULL && args->header.type == CDATA_RECURSE_ARGS);
8127
8128
326
        current = (uint16_t*)(args + 1);
8129
326
        end = current + args->size;
8130
326
        PCRE2_ASSERT(end > current);
8131
8132
326
        do
8133
326
          {
8134
326
          code[0] = OP_CREF;
8135
326
          PUT2(code, 1, *current);
8136
326
          code += 1 + IMM2_SIZE;
8137
326
          }
8138
326
        while (++current < end);
8139
8140
326
        length_prevgroup += (args->size * (1 + IMM2_SIZE));
8141
326
        pptr += args->skip_size;
8142
326
        cb->first_data = args->header.next;
8143
326
        cb->cx->memctl.free(args, cb->cx->memctl.memory_data);
8144
326
        }
8145
695
      }
8146
8147
75.2k
    groupsetfirstcu = FALSE;
8148
75.2k
    cb->had_recurse = TRUE;
8149
75.2k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8150
75.2k
    zerofirstcu = firstcu;
8151
75.2k
    zerofirstcuflags = firstcuflags;
8152
75.2k
    break;
8153
8154
8155
    /* ===============================================================*/
8156
    /* Handle capturing parentheses; the number is the meta argument. */
8157
8158
311k
    case META_CAPTURE:
8159
311k
    bravalue = OP_CBRA;
8160
311k
    skipunits = IMM2_SIZE;
8161
311k
    PUT2(code, 1+LINK_SIZE, meta_arg);
8162
311k
    cb->lastcapture = meta_arg;
8163
311k
    goto GROUP_PROCESS_NOTE_EMPTY;
8164
8165
8166
    /* ===============================================================*/
8167
    /* Handle escape sequence items. For ones like \d, the ESC_values are
8168
    arranged to be the same as the corresponding OP_values in the default case
8169
    when PCRE2_UCP is not set (which is the only case in which they will appear
8170
    here).
8171
8172
    Note: \Q and \E are never seen here, as they were dealt with in
8173
    parse_regex(). Neither are numerical back references or recursions, which
8174
    were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8175
    \g, when followed by names, are turned into META_BACKREF_BYNAME or
8176
    META_RECURSE_BYNAME. */
8177
8178
1.11M
    case META_ESCAPE:
8179
8180
    /* We can test for escape sequences that consume a character because their
8181
    values lie between ESC_b and ESC_Z; this may have to change if any new ones
8182
    are ever created. For these sequences, we disable the setting of a first
8183
    character if it hasn't already been set. */
8184
8185
1.11M
    if (meta_arg > ESC_b && meta_arg < ESC_Z)
8186
1.01M
      {
8187
1.01M
      matched_char = TRUE;
8188
1.01M
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8189
1.01M
      }
8190
8191
    /* Set values to reset to if this is followed by a zero repeat. */
8192
8193
1.11M
    zerofirstcu = firstcu;
8194
1.11M
    zerofirstcuflags = firstcuflags;
8195
1.11M
    zeroreqcu = reqcu;
8196
1.11M
    zeroreqcuflags = reqcuflags;
8197
8198
    /* If Unicode is not supported, \P and \p are not allowed and are
8199
    faulted at parse time, so will never appear here. */
8200
8201
1.11M
#ifdef SUPPORT_UNICODE
8202
1.11M
    if (meta_arg == ESC_P || meta_arg == ESC_p)
8203
141k
      {
8204
141k
      uint32_t ptype = *(++pptr) >> 16;
8205
141k
      uint32_t pdata = *pptr & 0xffff;
8206
8207
      /* In caseless matching, particular characteristics Lu, Ll, and Lt get
8208
      converted to the general characteristic L&. That is, upper, lower, and
8209
      title case letters are all conflated. */
8210
8211
141k
      if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
8212
141k
          (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
8213
230
        {
8214
230
        ptype = PT_LAMP;
8215
230
        pdata = 0;
8216
230
        }
8217
8218
      /* The special case of \p{Any} is compiled to OP_ALLANY and \P{Any}
8219
      is compiled to [] so as to benefit from the auto-anchoring code. */
8220
8221
141k
      if (ptype == PT_ANY)
8222
1.32k
        {
8223
1.32k
        if (meta_arg == ESC_P)
8224
1.22k
          {
8225
1.22k
          *code++ = OP_CLASS;
8226
1.22k
          memset(code, 0, 32);
8227
1.22k
          code += 32 / sizeof(PCRE2_UCHAR);
8228
1.22k
          }
8229
99
        else
8230
99
          *code++ = OP_ALLANY;
8231
1.32k
        }
8232
140k
      else
8233
140k
        {
8234
140k
        *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8235
140k
        *code++ = ptype;
8236
140k
        *code++ = pdata;
8237
140k
        }
8238
141k
      break;  /* End META_ESCAPE */
8239
141k
      }
8240
968k
#endif
8241
8242
    /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8243
    done. However, there's an option, in case anyone was relying on it. */
8244
8245
968k
    if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8246
968k
        (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8247
10
      {
8248
10
      *errorcodeptr = ERR99;
8249
10
      return 0;
8250
10
      }
8251
8252
    /* For the rest (including \X when Unicode is supported - if not it's
8253
    faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8254
    not set; if it is set, most of them do not show up here because they are
8255
    converted into Unicode property tests in parse_regex().
8256
8257
    In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8258
    instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8259
    There are special UCP codes for \B and \b which are used in UCP mode unless
8260
    "word" matching is being forced to ASCII.
8261
8262
    Note that \b and \B do a one-character lookbehind, and \A also behaves as
8263
    if it does. */
8264
8265
968k
    switch(meta_arg)
8266
968k
      {
8267
0
      case ESC_C:
8268
0
      cb->external_flags |= PCRE2_HASBKC;  /* Record */
8269
#if PCRE2_CODE_UNIT_WIDTH == 32
8270
      meta_arg = OP_ALLANY;
8271
      (void)utf; /* Avoid compiler warning. */
8272
#else
8273
0
      if (!utf) meta_arg = OP_ALLANY;
8274
0
#endif
8275
0
      break;
8276
8277
21.7k
      case ESC_B:
8278
48.8k
      case ESC_b:
8279
48.8k
      if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8280
12.2k
        meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8281
12.2k
          OP_UCP_WORD_BOUNDARY;
8282
      /* Fall through */
8283
8284
63.5k
      case ESC_A:
8285
63.5k
      if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8286
63.5k
      break;
8287
968k
      }
8288
8289
968k
    *code++ = meta_arg;
8290
968k
    break;  /* End META_ESCAPE */
8291
8292
8293
    /* ===================================================================*/
8294
    /* Handle an unrecognized meta value. A parsed pattern value less than
8295
    META_END is a literal. Otherwise we have a problem. */
8296
8297
18.6M
    default:
8298
18.6M
    if (meta >= META_END)
8299
0
      {
8300
0
      PCRE2_DEBUG_UNREACHABLE();
8301
0
      *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8302
0
      return 0;
8303
0
      }
8304
8305
    /* Handle a literal character. We come here by goto in the case of a
8306
    32-bit, non-UTF character whose value is greater than META_END. */
8307
8308
18.6M
    NORMAL_CHAR:
8309
18.6M
    meta = *pptr;     /* Get the full 32 bits */
8310
18.6M
    NORMAL_CHAR_SET:  /* Character is already in meta */
8311
18.6M
    matched_char = TRUE;
8312
8313
    /* For caseless UTF or UCP mode, check whether this character has more than
8314
    one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8315
    When casing restrictions apply, ignore caseless sets that start with an
8316
    ASCII character. If the character is affected by the special Turkish rules,
8317
    hardcode the matching characters using a caseset. */
8318
8319
18.6M
#ifdef SUPPORT_UNICODE
8320
18.6M
    if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8321
4.00M
      {
8322
4.00M
      uint32_t caseset;
8323
8324
4.00M
      if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
8325
4.00M
            PCRE2_EXTRA_TURKISH_CASING &&
8326
4.00M
          UCD_ANY_I(meta))
8327
0
        {
8328
0
        caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(meta)? 0 : 3);
8329
0
        }
8330
4.00M
      else if ((caseset = UCD_CASESET(meta)) != 0 &&
8331
4.00M
               (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
8332
4.00M
               PRIV(ucd_caseless_sets)[caseset] < 128)
8333
2.30k
        {
8334
2.30k
        caseset = 0;  /* Ignore the caseless set if it's restricted. */
8335
2.30k
        }
8336
8337
4.00M
      if (caseset != 0)
8338
209k
        {
8339
209k
        *code++ = OP_PROP;
8340
209k
        *code++ = PT_CLIST;
8341
209k
        *code++ = caseset;
8342
209k
        if (firstcuflags == REQ_UNSET)
8343
4.14k
          firstcuflags = zerofirstcuflags = REQ_NONE;
8344
209k
        break;  /* End handling this meta item */
8345
209k
        }
8346
4.00M
      }
8347
18.4M
#endif
8348
8349
    /* Caseful matches, or caseless and not one of the multicase characters. We
8350
    come here by goto in the case of a positive class that contains only
8351
    case-partners of a character with just two cases; matched_char has already
8352
    been set TRUE and options fudged if necessary. */
8353
8354
18.4M
    CLASS_CASELESS_CHAR:
8355
8356
    /* Get the character's code units into mcbuffer, with the length in
8357
    mclength. When not in UTF mode, the length is always 1. */
8358
8359
18.4M
#ifdef SUPPORT_UNICODE
8360
18.4M
    if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8361
16.0M
#endif
8362
16.0M
      {
8363
16.0M
      mclength = 1;
8364
16.0M
      mcbuffer[0] = meta;
8365
16.0M
      }
8366
8367
    /* Generate the appropriate code */
8368
8369
18.4M
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8370
18.4M
    memcpy(code, mcbuffer, CU2BYTES(mclength));
8371
18.4M
    code += mclength;
8372
8373
    /* Remember if \r or \n were seen */
8374
8375
18.4M
    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8376
189k
      cb->external_flags |= PCRE2_HASCRORLF;
8377
8378
    /* Set the first and required code units appropriately. If no previous
8379
    first code unit, set it from this character, but revert to none on a zero
8380
    repeat. Otherwise, leave the firstcu value alone, and don't change it on
8381
    a zero repeat. */
8382
8383
18.4M
    if (firstcuflags == REQ_UNSET)
8384
538k
      {
8385
538k
      zerofirstcuflags = REQ_NONE;
8386
538k
      zeroreqcu = reqcu;
8387
538k
      zeroreqcuflags = reqcuflags;
8388
8389
      /* If the character is more than one code unit long, we can set a single
8390
      firstcu only if it is not to be matched caselessly. Multiple possible
8391
      starting code units may be picked up later in the studying code. */
8392
8393
538k
      if (mclength == 1 || req_caseopt == 0)
8394
534k
        {
8395
534k
        firstcu = mcbuffer[0];
8396
534k
        firstcuflags = req_caseopt;
8397
534k
        if (mclength != 1)
8398
2.46k
          {
8399
2.46k
          reqcu = code[-1];
8400
2.46k
          reqcuflags = cb->req_varyopt;
8401
2.46k
          }
8402
534k
        }
8403
3.64k
      else firstcuflags = reqcuflags = REQ_NONE;
8404
538k
      }
8405
8406
    /* firstcu was previously set; we can set reqcu only if the length is
8407
    1 or the matching is caseful. */
8408
8409
17.8M
    else
8410
17.8M
      {
8411
17.8M
      zerofirstcu = firstcu;
8412
17.8M
      zerofirstcuflags = firstcuflags;
8413
17.8M
      zeroreqcu = reqcu;
8414
17.8M
      zeroreqcuflags = reqcuflags;
8415
17.8M
      if (mclength == 1 || req_caseopt == 0)
8416
17.8M
        {
8417
17.8M
        reqcu = code[-1];
8418
17.8M
        reqcuflags = req_caseopt | cb->req_varyopt;
8419
17.8M
        }
8420
17.8M
      }
8421
8422
    /* If caselessness was temporarily instated, reset it. */
8423
8424
18.4M
    if (reset_caseful)
8425
418
      {
8426
418
      options &= ~PCRE2_CASELESS;
8427
418
      req_caseopt = 0;
8428
418
      reset_caseful = FALSE;
8429
418
      }
8430
8431
18.4M
    break;    /* End literal character handling */
8432
29.1M
    }         /* End of big switch */
8433
29.1M
  }           /* End of big loop */
8434
8435
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8436
0
return 0;                  /* Avoid compiler warnings */
8437
2.09M
}
8438
8439
8440
8441
/*************************************************
8442
*   Compile regex: a sequence of alternatives    *
8443
*************************************************/
8444
8445
/* On entry, pptr is pointing past the bracket meta, but on return it points to
8446
the closing bracket or META_END. The code variable is pointing at the code unit
8447
into which the BRA operator has been stored. This function is used during the
8448
pre-compile phase when we are trying to find out the amount of memory needed,
8449
as well as during the real compile phase. The value of lengthptr distinguishes
8450
the two phases.
8451
8452
Arguments:
8453
  options           option bits, including any changes for this subpattern
8454
  xoptions          extra option bits, ditto
8455
  codeptr           -> the address of the current code pointer
8456
  pptrptr           -> the address of the current parsed pattern pointer
8457
  errorcodeptr      -> pointer to error code variable
8458
  skipunits         skip this many code units at start (for brackets and OP_COND)
8459
  firstcuptr        place to put the first required code unit
8460
  firstcuflagsptr   place to put the first code unit flags
8461
  reqcuptr          place to put the last required code unit
8462
  reqcuflagsptr     place to put the last required code unit flags
8463
  bcptr             pointer to the chain of currently open branches
8464
  cb                points to the data block with tables pointers etc.
8465
  lengthptr         NULL during the real compile phase
8466
                    points to length accumulator during pre-compile phase
8467
8468
Returns:            0 There has been an error
8469
                   +1 Success, this group must match at least one character
8470
                   -1 Success, this group may match an empty string
8471
*/
8472
8473
static int
8474
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8475
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8476
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8477
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8478
  compile_block *cb, PCRE2_SIZE *lengthptr)
8479
643k
{
8480
643k
PCRE2_UCHAR *code = *codeptr;
8481
643k
PCRE2_UCHAR *last_branch = code;
8482
643k
PCRE2_UCHAR *start_bracket = code;
8483
643k
BOOL lookbehind;
8484
643k
open_capitem capitem;
8485
643k
int capnumber = 0;
8486
643k
int okreturn = 1;
8487
643k
uint32_t *pptr = *pptrptr;
8488
643k
uint32_t firstcu, reqcu;
8489
643k
uint32_t lookbehindlength;
8490
643k
uint32_t lookbehindminlength;
8491
643k
uint32_t firstcuflags, reqcuflags;
8492
643k
PCRE2_SIZE length;
8493
643k
branch_chain bc;
8494
8495
/* If set, call the external function that checks for stack availability. */
8496
8497
643k
if (cb->cx->stack_guard != NULL &&
8498
643k
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8499
0
  {
8500
0
  *errorcodeptr= ERR33;
8501
0
  return 0;
8502
0
  }
8503
8504
/* Miscellaneous initialization */
8505
8506
643k
bc.outer = bcptr;
8507
643k
bc.current_branch = code;
8508
8509
643k
firstcu = reqcu = 0;
8510
643k
firstcuflags = reqcuflags = REQ_UNSET;
8511
8512
/* Accumulate the length for use in the pre-compile phase. Start with the
8513
length of the BRA and KET and any extra code units that are required at the
8514
beginning. We accumulate in a local variable to save frequent testing of
8515
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8516
start and end of each alternative, because compiled items are discarded during
8517
the pre-compile phase so that the workspace is not exceeded. */
8518
8519
643k
length = 2 + 2*LINK_SIZE + skipunits;
8520
8521
/* Remember if this is a lookbehind assertion, and if it is, save its length
8522
and skip over the pattern offset. */
8523
8524
643k
lookbehind = *code == OP_ASSERTBACK ||
8525
643k
             *code == OP_ASSERTBACK_NOT ||
8526
643k
             *code == OP_ASSERTBACK_NA;
8527
8528
643k
if (lookbehind)
8529
28.2k
  {
8530
28.2k
  lookbehindlength = META_DATA(pptr[-1]);
8531
28.2k
  lookbehindminlength = *pptr;
8532
28.2k
  pptr += SIZEOFFSET;
8533
28.2k
  }
8534
614k
else lookbehindlength = lookbehindminlength = 0;
8535
8536
/* If this is a capturing subpattern, add to the chain of open capturing items
8537
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8538
need be tested here; changing this opcode to one of its variants, e.g.
8539
OP_SCBRAPOS, happens later, after the group has been compiled. */
8540
8541
643k
if (*code == OP_CBRA)
8542
311k
  {
8543
311k
  capnumber = GET2(code, 1 + LINK_SIZE);
8544
311k
  capitem.number = capnumber;
8545
311k
  capitem.next = open_caps;
8546
311k
  capitem.assert_depth = cb->assert_depth;
8547
311k
  open_caps = &capitem;
8548
311k
  }
8549
8550
/* Offset is set zero to mark that this bracket is still open */
8551
8552
643k
PUT(code, 1, 0);
8553
643k
code += 1 + LINK_SIZE + skipunits;
8554
8555
/* Loop for each alternative branch */
8556
8557
643k
for (;;)
8558
2.09M
  {
8559
2.09M
  int branch_return;
8560
2.09M
  uint32_t branchfirstcu = 0, branchreqcu = 0;
8561
2.09M
  uint32_t branchfirstcuflags = REQ_UNSET, branchreqcuflags = REQ_UNSET;
8562
8563
  /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8564
  is only a single minimum length for the whole assertion. When the minimum
8565
  length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8566
  though not necessarily the same length. In this case, the original OP_REVERSE
8567
  can be used. It can also be used if a branch in a variable length lookbehind
8568
  has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8569
  maximum and minimum values. */
8570
8571
2.09M
  if (lookbehind && lookbehindlength > 0)
8572
34.0k
    {
8573
34.0k
    if (lookbehindminlength == LOOKBEHIND_MAX ||
8574
34.0k
        lookbehindminlength == lookbehindlength)
8575
21.3k
      {
8576
21.3k
      *code++ = OP_REVERSE;
8577
21.3k
      PUT2INC(code, 0, lookbehindlength);
8578
21.3k
      length += 1 + IMM2_SIZE;
8579
21.3k
      }
8580
12.6k
    else
8581
12.6k
      {
8582
12.6k
      *code++ = OP_VREVERSE;
8583
12.6k
      PUT2INC(code, 0, lookbehindminlength);
8584
12.6k
      PUT2INC(code, 0, lookbehindlength);
8585
12.6k
      length += 1 + 2*IMM2_SIZE;
8586
12.6k
      }
8587
34.0k
    }
8588
8589
  /* Now compile the branch; in the pre-compile phase its length gets added
8590
  into the length. */
8591
8592
2.09M
  if ((branch_return =
8593
2.09M
        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8594
2.09M
          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8595
2.09M
          &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8596
2.31k
    return 0;
8597
8598
  /* If a branch can match an empty string, so can the whole group. */
8599
8600
2.09M
  if (branch_return < 0) okreturn = -1;
8601
8602
  /* In the real compile phase, there is some post-processing to be done. */
8603
8604
2.09M
  if (lengthptr == NULL)
8605
1.02M
    {
8606
    /* If this is the first branch, the firstcu and reqcu values for the
8607
    branch become the values for the regex. */
8608
8609
1.02M
    if (*last_branch != OP_ALT)
8610
318k
      {
8611
318k
      firstcu = branchfirstcu;
8612
318k
      firstcuflags = branchfirstcuflags;
8613
318k
      reqcu = branchreqcu;
8614
318k
      reqcuflags = branchreqcuflags;
8615
318k
      }
8616
8617
    /* If this is not the first branch, the first char and reqcu have to
8618
    match the values from all the previous branches, except that if the
8619
    previous value for reqcu didn't have REQ_VARY set, it can still match,
8620
    and we set REQ_VARY for the group from this branch's value. */
8621
8622
710k
    else
8623
710k
      {
8624
      /* If we previously had a firstcu, but it doesn't match the new branch,
8625
      we have to abandon the firstcu for the regex, but if there was
8626
      previously no reqcu, it takes on the value of the old firstcu. */
8627
8628
710k
      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8629
557k
        {
8630
557k
        if (firstcuflags < REQ_NONE)
8631
32.0k
          {
8632
32.0k
          if (reqcuflags >= REQ_NONE)
8633
4.53k
            {
8634
4.53k
            reqcu = firstcu;
8635
4.53k
            reqcuflags = firstcuflags;
8636
4.53k
            }
8637
32.0k
          }
8638
557k
        firstcuflags = REQ_NONE;
8639
557k
        }
8640
8641
      /* If we (now or from before) have no firstcu, a firstcu from the
8642
      branch becomes a reqcu if there isn't a branch reqcu. */
8643
8644
710k
      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8645
710k
          branchreqcuflags >= REQ_NONE)
8646
26.2k
        {
8647
26.2k
        branchreqcu = branchfirstcu;
8648
26.2k
        branchreqcuflags = branchfirstcuflags;
8649
26.2k
        }
8650
8651
      /* Now ensure that the reqcus match */
8652
8653
710k
      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8654
710k
          reqcu != branchreqcu)
8655
673k
        reqcuflags = REQ_NONE;
8656
36.9k
      else
8657
36.9k
        {
8658
36.9k
        reqcu = branchreqcu;
8659
36.9k
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8660
36.9k
        }
8661
710k
      }
8662
1.02M
    }
8663
8664
  /* Handle reaching the end of the expression, either ')' or end of pattern.
8665
  In the real compile phase, go back through the alternative branches and
8666
  reverse the chain of offsets, with the field in the BRA item now becoming an
8667
  offset to the first alternative. If there are no alternatives, it points to
8668
  the end of the group. The length in the terminating ket is always the length
8669
  of the whole bracketed item. Return leaving the pointer at the terminating
8670
  char. */
8671
8672
2.09M
  if (META_CODE(*pptr) != META_ALT)
8673
640k
    {
8674
640k
    if (lengthptr == NULL)
8675
318k
      {
8676
318k
      uint32_t branch_length = (uint32_t)(code - last_branch);
8677
318k
      do
8678
1.02M
        {
8679
1.02M
        uint32_t prev_length = GET(last_branch, 1);
8680
1.02M
        PUT(last_branch, 1, branch_length);
8681
1.02M
        branch_length = prev_length;
8682
1.02M
        last_branch -= branch_length;
8683
1.02M
        }
8684
1.02M
      while (branch_length > 0);
8685
318k
      }
8686
8687
    /* Fill in the ket */
8688
8689
640k
    *code = OP_KET;
8690
640k
    PUT(code, 1, (uint32_t)(code - start_bracket));
8691
640k
    code += 1 + LINK_SIZE;
8692
8693
    /* Set values to pass back */
8694
8695
640k
    *codeptr = code;
8696
640k
    *pptrptr = pptr;
8697
640k
    *firstcuptr = firstcu;
8698
640k
    *firstcuflagsptr = firstcuflags;
8699
640k
    *reqcuptr = reqcu;
8700
640k
    *reqcuflagsptr = reqcuflags;
8701
640k
    if (lengthptr != NULL)
8702
322k
      {
8703
322k
      if (OFLOW_MAX - *lengthptr < length)
8704
0
        {
8705
0
        *errorcodeptr = ERR20;
8706
0
        return 0;
8707
0
        }
8708
322k
      *lengthptr += length;
8709
322k
      }
8710
640k
    return okreturn;
8711
640k
    }
8712
8713
  /* Another branch follows. In the pre-compile phase, we can move the code
8714
  pointer back to where it was for the start of the first branch. (That is,
8715
  pretend that each branch is the only one.)
8716
8717
  In the real compile phase, insert an ALT node. Its length field points back
8718
  to the previous branch while the bracket remains open. At the end the chain
8719
  is reversed. It's done like this so that the start of the bracket has a
8720
  zero offset until it is closed, making it possible to detect recursion. */
8721
8722
1.45M
  if (lengthptr != NULL)
8723
742k
    {
8724
742k
    code = *codeptr + 1 + LINK_SIZE + skipunits;
8725
742k
    length += 1 + LINK_SIZE;
8726
742k
    }
8727
710k
  else
8728
710k
    {
8729
710k
    *code = OP_ALT;
8730
710k
    PUT(code, 1, (int)(code - last_branch));
8731
710k
    bc.current_branch = last_branch = code;
8732
710k
    code += 1 + LINK_SIZE;
8733
710k
    }
8734
8735
  /* Set the maximum lookbehind length for the next branch (if not in a
8736
  lookbehind the value will be zero) and then advance past the vertical bar. */
8737
8738
1.45M
  lookbehindlength = META_DATA(*pptr);
8739
1.45M
  pptr++;
8740
1.45M
  }
8741
8742
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8743
0
return 0;                  /* Avoid compiler warnings */
8744
643k
}
8745
8746
8747
8748
/*************************************************
8749
*          Check for anchored pattern            *
8750
*************************************************/
8751
8752
/* Try to find out if this is an anchored regular expression. Consider each
8753
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8754
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8755
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8756
be found, because ^ generates OP_CIRCM in that mode.
8757
8758
We can also consider a regex to be anchored if OP_SOM starts all its branches.
8759
This is the code for \G, which means "match at start of match position, taking
8760
into account the match offset".
8761
8762
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8763
because that will try the rest of the pattern at all possible matching points,
8764
so there is no point trying again.... er ....
8765
8766
.... except when the .* appears inside capturing parentheses, and there is a
8767
subsequent back reference to those parentheses. We haven't enough information
8768
to catch that case precisely.
8769
8770
At first, the best we could do was to detect when .* was in capturing brackets
8771
and the highest back reference was greater than or equal to that level.
8772
However, by keeping a bitmap of the first 31 back references, we can catch some
8773
of the more common cases more precisely.
8774
8775
... A second exception is when the .* appears inside an atomic group, because
8776
this prevents the number of characters it matches from being adjusted.
8777
8778
Arguments:
8779
  code           points to start of the compiled pattern
8780
  bracket_map    a bitmap of which brackets we are inside while testing; this
8781
                   handles up to substring 31; after that we just have to take
8782
                   the less precise approach
8783
  cb             points to the compile data block
8784
  atomcount      atomic group level
8785
  inassert       TRUE if in an assertion
8786
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8787
8788
Returns:     TRUE or FALSE
8789
*/
8790
8791
static BOOL
8792
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8793
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8794
57.1k
{
8795
59.0k
do {
8796
59.0k
   PCRE2_SPTR scode = first_significant_code(
8797
59.0k
     code + PRIV(OP_lengths)[*code], FALSE);
8798
59.0k
   int op = *scode;
8799
8800
   /* Non-capturing brackets */
8801
8802
59.0k
   if (op == OP_BRA  || op == OP_BRAPOS ||
8803
59.0k
       op == OP_SBRA || op == OP_SBRAPOS)
8804
1.10k
     {
8805
1.10k
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8806
845
       return FALSE;
8807
1.10k
     }
8808
8809
   /* Capturing brackets */
8810
8811
57.9k
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8812
57.9k
            op == OP_SCBRA || op == OP_SCBRAPOS)
8813
3.57k
     {
8814
3.57k
     int n = GET2(scode, 1+LINK_SIZE);
8815
3.57k
     uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8816
3.57k
     if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;
8817
3.57k
     }
8818
8819
   /* Positive forward assertion */
8820
8821
54.4k
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8822
602
     {
8823
602
     if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;
8824
602
     }
8825
8826
   /* Condition. If there is no second branch, it can't be anchored. */
8827
8828
53.8k
   else if (op == OP_COND || op == OP_SCOND)
8829
290
     {
8830
290
     if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8831
108
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8832
93
       return FALSE;
8833
108
     }
8834
8835
   /* Atomic groups */
8836
8837
53.5k
   else if (op == OP_ONCE)
8838
231
     {
8839
231
     if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8840
168
       return FALSE;
8841
231
     }
8842
8843
   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8844
   it isn't in brackets that are or may be referenced or inside an atomic
8845
   group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8846
   because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8847
   with the subject "aab", which matches "b", i.e. not at the start of a line.
8848
   There is also an option that disables auto-anchoring. */
8849
8850
53.2k
   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8851
53.2k
             op == OP_TYPEPOSSTAR))
8852
4.39k
     {
8853
4.39k
     if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8854
4.39k
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8855
3.07k
       return FALSE;
8856
4.39k
     }
8857
8858
   /* Check for explicit anchoring */
8859
8860
48.8k
   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8861
8862
3.42k
   code += GET(code, 1);
8863
3.42k
   }
8864
57.1k
while (*code == OP_ALT);   /* Loop for each alternative */
8865
1.50k
return TRUE;
8866
57.1k
}
8867
8868
8869
8870
/*************************************************
8871
*         Check for starting with ^ or .*        *
8872
*************************************************/
8873
8874
/* This is called to find out if every branch starts with ^ or .* so that
8875
"first char" processing can be done to speed things up in multiline
8876
matching and for non-DOTALL patterns that start with .* (which must start at
8877
the beginning or after \n). As in the case of is_anchored() (see above), we
8878
have to take account of back references to capturing brackets that contain .*
8879
because in that case we can't make the assumption. Also, the appearance of .*
8880
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8881
or *SKIP does not count, because once again the assumption no longer holds.
8882
8883
Arguments:
8884
  code           points to start of the compiled pattern or a group
8885
  bracket_map    a bitmap of which brackets we are inside while testing; this
8886
                   handles up to substring 31; after that we just have to take
8887
                   the less precise approach
8888
  cb             points to the compile data
8889
  atomcount      atomic group level
8890
  inassert       TRUE if in an assertion
8891
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8892
8893
Returns:         TRUE or FALSE
8894
*/
8895
8896
static BOOL
8897
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8898
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8899
34.8k
{
8900
37.6k
do {
8901
37.6k
   PCRE2_SPTR scode = first_significant_code(
8902
37.6k
     code + PRIV(OP_lengths)[*code], FALSE);
8903
37.6k
   int op = *scode;
8904
8905
   /* If we are at the start of a conditional assertion group, *both* the
8906
   conditional assertion *and* what follows the condition must satisfy the test
8907
   for start of line. Other kinds of condition fail. Note that there may be an
8908
   auto-callout at the start of a condition. */
8909
8910
37.6k
   if (op == OP_COND)
8911
207
     {
8912
207
     scode += 1 + LINK_SIZE;
8913
8914
207
     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
8915
181
       else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
8916
8917
207
     switch (*scode)
8918
207
       {
8919
12
       case OP_CREF:
8920
20
       case OP_DNCREF:
8921
39
       case OP_RREF:
8922
44
       case OP_DNRREF:
8923
68
       case OP_FAIL:
8924
73
       case OP_FALSE:
8925
79
       case OP_TRUE:
8926
79
       return FALSE;
8927
8928
128
       default:     /* Assertion */
8929
128
       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
8930
113
         return FALSE;
8931
22
       do scode += GET(scode, 1); while (*scode == OP_ALT);
8932
15
       scode += 1 + LINK_SIZE;
8933
15
       break;
8934
207
       }
8935
15
     scode = first_significant_code(scode, FALSE);
8936
15
     op = *scode;
8937
15
     }
8938
8939
   /* Non-capturing brackets */
8940
8941
37.4k
   if (op == OP_BRA  || op == OP_BRAPOS ||
8942
37.4k
       op == OP_SBRA || op == OP_SBRAPOS)
8943
694
     {
8944
694
     if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8945
496
       return FALSE;
8946
694
     }
8947
8948
   /* Capturing brackets */
8949
8950
36.7k
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8951
36.7k
            op == OP_SCBRA || op == OP_SCBRAPOS)
8952
2.83k
     {
8953
2.83k
     int n = GET2(scode, 1+LINK_SIZE);
8954
2.83k
     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8955
2.83k
     if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))
8956
1.73k
       return FALSE;
8957
2.83k
     }
8958
8959
   /* Positive forward assertions */
8960
8961
33.8k
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8962
357
     {
8963
357
     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
8964
282
       return FALSE;
8965
357
     }
8966
8967
   /* Atomic brackets */
8968
8969
33.5k
   else if (op == OP_ONCE)
8970
177
     {
8971
177
     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8972
125
       return FALSE;
8973
177
     }
8974
8975
   /* .* means "start at start or after \n" if it isn't in atomic brackets or
8976
   brackets that may be referenced or an assertion, and as long as the pattern
8977
   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
8978
   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
8979
   i.e. not at the start of a line. There is also an option that disables this
8980
   optimization. */
8981
8982
33.3k
   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
8983
4.40k
     {
8984
4.40k
     if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
8985
4.40k
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8986
1.61k
       return FALSE;
8987
4.40k
     }
8988
8989
   /* Check for explicit circumflex; anything else gives a FALSE result. Note
8990
   in particular that this includes atomic brackets OP_ONCE because the number
8991
   of characters matched by .* cannot be adjusted inside them. */
8992
8993
28.9k
   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
8994
8995
   /* Move on to the next alternative */
8996
8997
4.94k
   code += GET(code, 1);
8998
4.94k
   }
8999
34.8k
while (*code == OP_ALT);  /* Loop for each alternative */
9000
2.20k
return TRUE;
9001
34.8k
}
9002
9003
9004
9005
/*************************************************
9006
*   Scan compiled regex for recursion reference  *
9007
*************************************************/
9008
9009
/* This function scans through a compiled pattern until it finds an instance of
9010
OP_RECURSE.
9011
9012
Arguments:
9013
  code        points to start of expression
9014
  utf         TRUE in UTF mode
9015
9016
Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
9017
*/
9018
9019
static PCRE2_UCHAR *
9020
find_recurse(PCRE2_UCHAR *code, BOOL utf)
9021
485k
{
9022
485k
for (;;)
9023
158M
  {
9024
158M
  PCRE2_UCHAR c = *code;
9025
158M
  if (c == OP_END) return NULL;
9026
158M
  if (c == OP_RECURSE) return code;
9027
9028
  /* XCLASS is used for classes that cannot be represented just by a bit map.
9029
  This includes negated single high-valued characters. ECLASS is used for
9030
  classes that use set operations internally. CALLOUT_STR is used for
9031
  callouts with string arguments. In each case the length in the table is
9032
  zero; the actual length is stored in the compiled code. */
9033
9034
158M
  if (c == OP_XCLASS || c == OP_ECLASS) code += GET(code, 1);
9035
158M
  else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
9036
9037
  /* Otherwise, we can get the item's length from the table, except that for
9038
  repeated character types, we have to test for \p and \P, which have an extra
9039
  two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
9040
  we must add in its length. */
9041
9042
158M
  else
9043
158M
    {
9044
158M
    switch(c)
9045
158M
      {
9046
700k
      case OP_TYPESTAR:
9047
832k
      case OP_TYPEMINSTAR:
9048
1.41M
      case OP_TYPEPLUS:
9049
1.53M
      case OP_TYPEMINPLUS:
9050
1.80M
      case OP_TYPEQUERY:
9051
1.97M
      case OP_TYPEMINQUERY:
9052
1.98M
      case OP_TYPEPOSSTAR:
9053
1.99M
      case OP_TYPEPOSPLUS:
9054
2.01M
      case OP_TYPEPOSQUERY:
9055
2.01M
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
9056
2.01M
      break;
9057
9058
344
      case OP_TYPEPOSUPTO:
9059
103k
      case OP_TYPEUPTO:
9060
122k
      case OP_TYPEMINUPTO:
9061
189k
      case OP_TYPEEXACT:
9062
189k
      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
9063
51.4k
        code += 2;
9064
189k
      break;
9065
9066
55.2k
      case OP_MARK:
9067
58.2k
      case OP_COMMIT_ARG:
9068
61.0k
      case OP_PRUNE_ARG:
9069
117k
      case OP_SKIP_ARG:
9070
179k
      case OP_THEN_ARG:
9071
179k
      code += code[1];
9072
179k
      break;
9073
158M
      }
9074
9075
    /* Add in the fixed length from the table */
9076
9077
158M
    code += PRIV(OP_lengths)[c];
9078
9079
    /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
9080
    be followed by a multi-unit character. The length in the table is a
9081
    minimum, so we have to arrange to skip the extra units. */
9082
9083
158M
#ifdef MAYBE_UTF_MULTI
9084
158M
    if (utf) switch(c)
9085
1.79M
      {
9086
239k
      case OP_CHAR:
9087
921k
      case OP_CHARI:
9088
929k
      case OP_NOT:
9089
940k
      case OP_NOTI:
9090
941k
      case OP_EXACT:
9091
942k
      case OP_EXACTI:
9092
946k
      case OP_NOTEXACT:
9093
953k
      case OP_NOTEXACTI:
9094
954k
      case OP_UPTO:
9095
954k
      case OP_UPTOI:
9096
954k
      case OP_NOTUPTO:
9097
954k
      case OP_NOTUPTOI:
9098
955k
      case OP_MINUPTO:
9099
955k
      case OP_MINUPTOI:
9100
956k
      case OP_NOTMINUPTO:
9101
956k
      case OP_NOTMINUPTOI:
9102
964k
      case OP_POSUPTO:
9103
977k
      case OP_POSUPTOI:
9104
978k
      case OP_NOTPOSUPTO:
9105
978k
      case OP_NOTPOSUPTOI:
9106
984k
      case OP_STAR:
9107
1.00M
      case OP_STARI:
9108
1.00M
      case OP_NOTSTAR:
9109
1.00M
      case OP_NOTSTARI:
9110
1.01M
      case OP_MINSTAR:
9111
1.01M
      case OP_MINSTARI:
9112
1.01M
      case OP_NOTMINSTAR:
9113
1.01M
      case OP_NOTMINSTARI:
9114
1.01M
      case OP_POSSTAR:
9115
1.01M
      case OP_POSSTARI:
9116
1.02M
      case OP_NOTPOSSTAR:
9117
1.02M
      case OP_NOTPOSSTARI:
9118
1.03M
      case OP_PLUS:
9119
1.05M
      case OP_PLUSI:
9120
1.05M
      case OP_NOTPLUS:
9121
1.05M
      case OP_NOTPLUSI:
9122
1.06M
      case OP_MINPLUS:
9123
1.07M
      case OP_MINPLUSI:
9124
1.07M
      case OP_NOTMINPLUS:
9125
1.07M
      case OP_NOTMINPLUSI:
9126
1.07M
      case OP_POSPLUS:
9127
1.07M
      case OP_POSPLUSI:
9128
1.07M
      case OP_NOTPOSPLUS:
9129
1.07M
      case OP_NOTPOSPLUSI:
9130
1.07M
      case OP_QUERY:
9131
1.07M
      case OP_QUERYI:
9132
1.07M
      case OP_NOTQUERY:
9133
1.07M
      case OP_NOTQUERYI:
9134
1.07M
      case OP_MINQUERY:
9135
1.07M
      case OP_MINQUERYI:
9136
1.07M
      case OP_NOTMINQUERY:
9137
1.07M
      case OP_NOTMINQUERYI:
9138
1.07M
      case OP_POSQUERY:
9139
1.07M
      case OP_POSQUERYI:
9140
1.07M
      case OP_NOTPOSQUERY:
9141
1.07M
      case OP_NOTPOSQUERYI:
9142
1.07M
      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9143
1.07M
      break;
9144
1.79M
      }
9145
#else
9146
    (void)(utf);  /* Keep compiler happy by referencing function argument */
9147
#endif  /* MAYBE_UTF_MULTI */
9148
158M
    }
9149
158M
  }
9150
485k
}
9151
9152
9153
9154
/*************************************************
9155
*    Check for asserted fixed first code unit    *
9156
*************************************************/
9157
9158
/* During compilation, the "first code unit" settings from forward assertions
9159
are discarded, because they can cause conflicts with actual literals that
9160
follow. However, if we end up without a first code unit setting for an
9161
unanchored pattern, it is worth scanning the regex to see if there is an
9162
initial asserted first code unit. If all branches start with the same asserted
9163
code unit, or with a non-conditional bracket all of whose alternatives start
9164
with the same asserted code unit (recurse ad lib), then we return that code
9165
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9166
REQ_NONE in the flags.
9167
9168
Arguments:
9169
  code       points to start of compiled pattern
9170
  flags      points to the first code unit flags
9171
  inassert   non-zero if in an assertion
9172
9173
Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9174
*/
9175
9176
static uint32_t
9177
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9178
35.3k
{
9179
35.3k
uint32_t c = 0;
9180
35.3k
uint32_t cflags = REQ_NONE;
9181
9182
35.3k
*flags = REQ_NONE;
9183
40.4k
do {
9184
40.4k
   uint32_t d;
9185
40.4k
   uint32_t dflags;
9186
40.4k
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9187
40.4k
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9188
40.4k
   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9189
40.4k
   PCRE2_UCHAR op = *scode;
9190
9191
40.4k
   switch(op)
9192
40.4k
     {
9193
19.8k
     default:
9194
19.8k
     return 0;
9195
9196
511
     case OP_BRA:
9197
662
     case OP_BRAPOS:
9198
2.53k
     case OP_CBRA:
9199
2.56k
     case OP_SCBRA:
9200
2.97k
     case OP_CBRAPOS:
9201
3.10k
     case OP_SCBRAPOS:
9202
3.45k
     case OP_ASSERT:
9203
3.52k
     case OP_ASSERT_NA:
9204
3.62k
     case OP_ONCE:
9205
3.68k
     case OP_SCRIPT_RUN:
9206
3.68k
     d = find_firstassertedcu(scode, &dflags, inassert +
9207
3.68k
       ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9208
3.68k
     if (dflags >= REQ_NONE) return 0;
9209
774
     if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9210
151
       else if (c != d || cflags != dflags) return 0;
9211
741
     break;
9212
9213
741
     case OP_EXACT:
9214
418
     scode += IMM2_SIZE;
9215
     /* Fall through */
9216
9217
11.1k
     case OP_CHAR:
9218
11.6k
     case OP_PLUS:
9219
11.7k
     case OP_MINPLUS:
9220
12.6k
     case OP_POSPLUS:
9221
12.6k
     if (inassert == 0) return 0;
9222
3.41k
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9223
3.05k
       else if (c != scode[1]) return 0;
9224
3.35k
     break;
9225
9226
3.35k
     case OP_EXACTI:
9227
122
     scode += IMM2_SIZE;
9228
     /* Fall through */
9229
9230
3.30k
     case OP_CHARI:
9231
3.48k
     case OP_PLUSI:
9232
4.09k
     case OP_MINPLUSI:
9233
4.22k
     case OP_POSPLUSI:
9234
4.22k
     if (inassert == 0) return 0;
9235
9236
     /* If the character is more than one code unit long, we cannot set its
9237
     first code unit when matching caselessly. Later scanning may pick up
9238
     multiple code units. */
9239
9240
1.89k
#ifdef SUPPORT_UNICODE
9241
1.89k
#if PCRE2_CODE_UNIT_WIDTH == 8
9242
1.89k
     if (scode[1] >= 0x80) return 0;
9243
#elif PCRE2_CODE_UNIT_WIDTH == 16
9244
     if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9245
#endif
9246
1.87k
#endif
9247
9248
1.87k
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9249
1.78k
       else if (c != scode[1]) return 0;
9250
1.85k
     break;
9251
40.4k
     }
9252
9253
5.94k
   code += GET(code, 1);
9254
5.94k
   }
9255
35.3k
while (*code == OP_ALT);
9256
9257
864
*flags = cflags;
9258
864
return c;
9259
35.3k
}
9260
9261
9262
9263
/*************************************************
9264
*             Skip in parsed pattern             *
9265
*************************************************/
9266
9267
/* This function is called to skip parts of the parsed pattern when finding the
9268
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9269
the end of the branch, it is called to skip over an internal lookaround or
9270
(DEFINE) group, and it is also called to skip to the end of a class, during
9271
which it will never encounter nested groups (but there's no need to have
9272
special code for that).
9273
9274
When called to find the end of a branch or group, pptr must point to the first
9275
meta code inside the branch, not the branch-starting code. In other cases it
9276
can point to the item that causes the function to be called.
9277
9278
Arguments:
9279
  pptr       current pointer to skip from
9280
  skiptype   PSKIP_CLASS when skipping to end of class
9281
             PSKIP_ALT when META_ALT ends the skip
9282
             PSKIP_KET when only META_KET ends the skip
9283
9284
Returns:     new value of pptr
9285
             NULL if META_END is reached - should never occur
9286
               or for an unknown meta value - likewise
9287
*/
9288
9289
static uint32_t *
9290
parsed_skip(uint32_t *pptr, uint32_t skiptype)
9291
105k
{
9292
105k
uint32_t nestlevel = 0;
9293
9294
131M
for (;; pptr++)
9295
132M
  {
9296
132M
  uint32_t meta = META_CODE(*pptr);
9297
9298
132M
  switch(meta)
9299
132M
    {
9300
118M
    default:  /* Just skip over most items */
9301
118M
    if (meta < META_END) continue;  /* Literal */
9302
39.5M
    break;
9303
9304
39.5M
    case META_END:
9305
9306
    /* The parsed regex is malformed; we have reached the end and did
9307
    not find the end of the construct which we are skipping over. */
9308
9309
0
    PCRE2_DEBUG_UNREACHABLE();
9310
0
    return NULL;
9311
9312
    /* The data for these items is variable in length. */
9313
9314
33.9k
    case META_BACKREF:  /* Offset is present only if group >= 10 */
9315
33.9k
    if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9316
33.9k
    break;
9317
9318
5.46M
    case META_ESCAPE:
9319
5.46M
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9320
1.80M
      pptr += 1;     /* Skip prop data */
9321
5.46M
    break;
9322
9323
1.11k
    case META_MARK:     /* Add the length of the name. */
9324
1.96k
    case META_COMMIT_ARG:
9325
4.98k
    case META_PRUNE_ARG:
9326
6.48k
    case META_SKIP_ARG:
9327
21.4k
    case META_THEN_ARG:
9328
21.4k
    pptr += pptr[1];
9329
21.4k
    break;
9330
9331
    /* These are the "active" items in this loop. */
9332
9333
264k
    case META_CLASS_END:
9334
264k
    if (skiptype == PSKIP_CLASS) return pptr;
9335
251k
    break;
9336
9337
251k
    case META_ATOMIC:
9338
1.90M
    case META_CAPTURE:
9339
1.90M
    case META_COND_ASSERT:
9340
1.90M
    case META_COND_DEFINE:
9341
1.90M
    case META_COND_NAME:
9342
1.90M
    case META_COND_NUMBER:
9343
1.90M
    case META_COND_RNAME:
9344
1.91M
    case META_COND_RNUMBER:
9345
1.91M
    case META_COND_VERSION:
9346
1.91M
    case META_SCS:
9347
2.05M
    case META_LOOKAHEAD:
9348
2.09M
    case META_LOOKAHEADNOT:
9349
2.09M
    case META_LOOKAHEAD_NA:
9350
2.10M
    case META_LOOKBEHIND:
9351
2.18M
    case META_LOOKBEHINDNOT:
9352
2.18M
    case META_LOOKBEHIND_NA:
9353
2.37M
    case META_NOCAPTURE:
9354
2.37M
    case META_SCRIPT_RUN:
9355
2.37M
    nestlevel++;
9356
2.37M
    break;
9357
9358
2.51M
    case META_ALT:
9359
2.51M
    if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9360
2.51M
    break;
9361
9362
2.51M
    case META_KET:
9363
2.46M
    if (nestlevel == 0) return pptr;
9364
2.37M
    nestlevel--;
9365
2.37M
    break;
9366
132M
    }
9367
9368
  /* The extra data item length for each meta is in a table. */
9369
9370
52.5M
  meta = (meta >> 16) & 0x7fff;
9371
52.5M
  if (meta >= sizeof(meta_extra_lengths)) return NULL;
9372
52.5M
  pptr += meta_extra_lengths[meta];
9373
52.5M
  }
9374
9375
105k
PCRE2_UNREACHABLE(); /* Control never reaches here */
9376
105k
}
9377
9378
9379
9380
/*************************************************
9381
*       Find length of a parsed group            *
9382
*************************************************/
9383
9384
/* This is called for nested groups within a branch of a lookbehind whose
9385
length is being computed. On entry, the pointer must be at the first element
9386
after the group initializing code. On exit it points to OP_KET. Caching is used
9387
to improve processing speed when the same capturing group occurs many times.
9388
9389
Arguments:
9390
  pptrptr     pointer to pointer in the parsed pattern
9391
  minptr      where to return the minimum length
9392
  isinline    FALSE if a reference or recursion; TRUE for inline group
9393
  errcodeptr  pointer to the errorcode
9394
  lcptr       pointer to the loop counter
9395
  group       number of captured group or -1 for a non-capturing group
9396
  recurses    chain of recurse_check to catch mutual recursion
9397
  cb          pointer to the compile data
9398
9399
Returns:      the maximum group length or a negative number
9400
*/
9401
9402
static int
9403
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9404
  int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9405
256k
{
9406
256k
uint32_t *gi = cb->groupinfo + 2 * group;
9407
256k
int branchlength, branchminlength;
9408
256k
int grouplength = -1;
9409
256k
int groupminlength = INT_MAX;
9410
9411
/* The cache can be used only if there is no possibility of there being two
9412
groups with the same number. We do not need to set the end pointer for a group
9413
that is being processed as a back reference or recursion, but we must do so for
9414
an inline group. */
9415
9416
256k
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9417
9.76k
  {
9418
9.76k
  uint32_t groupinfo = gi[0];
9419
9.76k
  if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9420
9.76k
  if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9421
1.51k
    {
9422
1.51k
    if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9423
1.51k
    *minptr = gi[1];
9424
1.51k
    return groupinfo & GI_FIXED_LENGTH_MASK;
9425
1.51k
    }
9426
9.76k
  }
9427
9428
/* Scan the group. In this case we find the end pointer of necessity. */
9429
9430
255k
for(;;)
9431
270k
  {
9432
270k
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9433
270k
    recurses, cb);
9434
270k
  if (branchlength < 0) goto ISNOTFIXED;
9435
265k
  if (branchlength > grouplength) grouplength = branchlength;
9436
265k
  if (branchminlength < groupminlength) groupminlength = branchminlength;
9437
265k
  if (**pptrptr == META_KET) break;
9438
15.7k
  *pptrptr += 1;   /* Skip META_ALT */
9439
15.7k
  }
9440
9441
249k
if (group > 0)
9442
219k
  {
9443
219k
  gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9444
219k
  gi[1] = groupminlength;
9445
219k
  }
9446
9447
249k
*minptr = groupminlength;
9448
249k
return grouplength;
9449
9450
5.34k
ISNOTFIXED:
9451
5.34k
if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9452
5.34k
return -1;
9453
255k
}
9454
9455
9456
9457
/*************************************************
9458
*        Find length of a parsed branch          *
9459
*************************************************/
9460
9461
/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9462
giving an error if the length is not limited. On entry, *pptrptr points to the
9463
first element inside the branch. On exit it is set to point to the ALT or KET.
9464
9465
Arguments:
9466
  pptrptr     pointer to pointer in the parsed pattern
9467
  minptr      where to return the minimum length
9468
  errcodeptr  pointer to error code
9469
  lcptr       pointer to loop counter
9470
  recurses    chain of recurse_check to catch mutual recursion
9471
  cb          pointer to compile block
9472
9473
Returns:      the maximum length, or a negative value on error
9474
*/
9475
9476
static int
9477
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9478
  parsed_recurse_check *recurses, compile_block *cb)
9479
318k
{
9480
318k
int branchlength = 0;
9481
318k
int branchminlength = 0;
9482
318k
int grouplength, groupminlength;
9483
318k
uint32_t lastitemlength = 0;
9484
318k
uint32_t lastitemminlength = 0;
9485
318k
uint32_t *pptr = *pptrptr;
9486
318k
PCRE2_SIZE offset;
9487
318k
parsed_recurse_check this_recurse;
9488
9489
/* A large and/or complex regex can take too long to process. This can happen
9490
more often when (?| groups are present in the pattern because their length
9491
cannot be cached. */
9492
9493
318k
if ((*lcptr)++ > 2000)
9494
30
  {
9495
30
  *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9496
30
  return -1;
9497
30
  }
9498
9499
/* Scan the branch, accumulating the length. */
9500
9501
2.64M
for (;; pptr++)
9502
2.96M
  {
9503
2.96M
  parsed_recurse_check *r;
9504
2.96M
  uint32_t *gptr, *gptrend;
9505
2.96M
  uint32_t escape;
9506
2.96M
  uint32_t min, max;
9507
2.96M
  uint32_t group = 0;
9508
2.96M
  uint32_t itemlength = 0;
9509
2.96M
  uint32_t itemminlength = 0;
9510
9511
2.96M
  if (*pptr < META_END)
9512
1.64M
    {
9513
1.64M
    itemlength = itemminlength = 1;
9514
1.64M
    }
9515
9516
1.31M
  else switch (META_CODE(*pptr))
9517
1.31M
    {
9518
273k
    case META_KET:
9519
302k
    case META_ALT:
9520
302k
    goto EXIT;
9521
9522
    /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9523
    actual termination. */
9524
9525
9.21k
    case META_ACCEPT:
9526
9.90k
    case META_FAIL:
9527
9.90k
    pptr = parsed_skip(pptr, PSKIP_ALT);
9528
9.90k
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9529
9.90k
    goto EXIT;
9530
9531
9.90k
    case META_MARK:
9532
946
    case META_COMMIT_ARG:
9533
2.07k
    case META_PRUNE_ARG:
9534
2.73k
    case META_SKIP_ARG:
9535
7.93k
    case META_THEN_ARG:
9536
7.93k
    pptr += pptr[1] + 1;
9537
7.93k
    break;
9538
9539
4.77k
    case META_CIRCUMFLEX:
9540
5.57k
    case META_COMMIT:
9541
9.67k
    case META_DOLLAR:
9542
10.0k
    case META_PRUNE:
9543
15.1k
    case META_SKIP:
9544
15.8k
    case META_THEN:
9545
15.8k
    break;
9546
9547
301
    case META_OPTIONS:
9548
301
    pptr += 2;
9549
301
    break;
9550
9551
0
    case META_BIGVALUE:
9552
0
    itemlength = itemminlength = 1;
9553
0
    pptr += 1;
9554
0
    break;
9555
9556
2.11k
    case META_CLASS:
9557
12.9k
    case META_CLASS_NOT:
9558
12.9k
    itemlength = itemminlength = 1;
9559
12.9k
    pptr = parsed_skip(pptr, PSKIP_CLASS);
9560
12.9k
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9561
12.9k
    break;
9562
9563
12.9k
    case META_CLASS_EMPTY_NOT:
9564
9.55k
    case META_DOT:
9565
9.55k
    itemlength = itemminlength = 1;
9566
9.55k
    break;
9567
9568
575k
    case META_CALLOUT_NUMBER:
9569
575k
    pptr += 3;
9570
575k
    break;
9571
9572
1.78k
    case META_CALLOUT_STRING:
9573
1.78k
    pptr += 3 + SIZEOFFSET;
9574
1.78k
    break;
9575
9576
    /* Only some escapes consume a character. Of those, \R can match one or two
9577
    characters, but \X is never allowed because it matches an unknown number of
9578
    characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9579
9580
20.3k
    case META_ESCAPE:
9581
20.3k
    escape = META_DATA(*pptr);
9582
20.3k
    if (escape == ESC_X) return -1;
9583
20.2k
    if (escape == ESC_R)
9584
704
      {
9585
704
      itemminlength = 1;
9586
704
      itemlength = 2;
9587
704
      }
9588
19.5k
    else if (escape > ESC_b && escape < ESC_Z)
9589
12.0k
      {
9590
12.0k
#if PCRE2_CODE_UNIT_WIDTH != 32
9591
12.0k
      if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9592
0
        {
9593
0
        *errcodeptr = ERR36;
9594
0
        return -1;
9595
0
        }
9596
12.0k
#endif
9597
12.0k
      itemlength = itemminlength = 1;
9598
12.0k
      if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9599
12.0k
      }
9600
20.2k
    break;
9601
9602
    /* Lookaheads do not contribute to the length of this branch, but they may
9603
    contain lookbehinds within them whose lengths need to be set. */
9604
9605
20.2k
    case META_LOOKAHEAD:
9606
29.7k
    case META_LOOKAHEADNOT:
9607
30.3k
    case META_LOOKAHEAD_NA:
9608
30.3k
    case META_SCS:
9609
30.3k
    *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9610
30.3k
    if (*errcodeptr != 0) return -1;
9611
9612
    /* Ignore any qualifiers that follow a lookahead assertion. */
9613
9614
30.3k
    switch (pptr[1])
9615
30.3k
      {
9616
507
      case META_ASTERISK:
9617
603
      case META_ASTERISK_PLUS:
9618
5.15k
      case META_ASTERISK_QUERY:
9619
5.60k
      case META_PLUS:
9620
5.67k
      case META_PLUS_PLUS:
9621
9.45k
      case META_PLUS_QUERY:
9622
9.81k
      case META_QUERY:
9623
10.3k
      case META_QUERY_PLUS:
9624
10.6k
      case META_QUERY_QUERY:
9625
10.6k
      pptr++;
9626
10.6k
      break;
9627
9628
313
      case META_MINMAX:
9629
810
      case META_MINMAX_PLUS:
9630
932
      case META_MINMAX_QUERY:
9631
932
      pptr += 3;
9632
932
      break;
9633
9634
18.7k
      default:
9635
18.7k
      break;
9636
30.3k
      }
9637
30.3k
    break;
9638
9639
    /* A nested lookbehind does not contribute any length to this lookbehind,
9640
    but must itself be checked and have its lengths set. Note that
9641
    set_lookbehind_lengths() updates pptr, leaving it pointing to the final ket
9642
    of the group, so no need to update it here. */
9643
9644
30.3k
    case META_LOOKBEHIND:
9645
4.83k
    case META_LOOKBEHINDNOT:
9646
10.6k
    case META_LOOKBEHIND_NA:
9647
10.6k
    if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9648
246
      return -1;
9649
10.4k
    break;
9650
9651
    /* Back references and recursions are handled by very similar code. At this
9652
    stage, the names generated in the parsing pass are available, but the main
9653
    name table has not yet been created. So for the named varieties, scan the
9654
    list of names in order to get the number of the first one in the pattern,
9655
    and whether or not this name is duplicated. */
9656
9657
10.4k
    case META_BACKREF_BYNAME:
9658
57
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9659
2
      goto ISNOTFIXED;
9660
    /* Fall through */
9661
9662
75
    case META_RECURSE_BYNAME:
9663
75
      {
9664
75
      PCRE2_SPTR name;
9665
75
      BOOL is_dupname = FALSE;
9666
75
      named_group *ng;
9667
75
      uint32_t meta_code = META_CODE(*pptr);
9668
75
      uint32_t length = *(++pptr);
9669
9670
75
      GETPLUSOFFSET(offset, pptr);
9671
75
      name = cb->start_pattern + offset;
9672
75
      ng = PRIV(compile_find_named_group)(name, length, cb);
9673
9674
75
      if (ng == NULL)
9675
9
        {
9676
9
        *errcodeptr = ERR15;  /* Non-existent subpattern */
9677
9
        cb->erroroffset = offset;
9678
9
        return -1;
9679
9
        }
9680
9681
66
      group = ng->number;
9682
66
      is_dupname = (ng->hash_dup & NAMED_GROUP_IS_DUPNAME) != 0;
9683
9684
      /* A numerical back reference can be fixed length if duplicate capturing
9685
      groups are not being used. A non-duplicate named back reference can also
9686
      be handled. */
9687
9688
66
      if (meta_code == META_RECURSE_BYNAME ||
9689
66
          (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9690
60
        goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9691
66
      }
9692
6
    goto ISNOTFIXED;                     /* Duplicate name or number */
9693
9694
    /* The offset values for back references < 10 are in a separate vector
9695
    because otherwise they would use more than two parsed pattern elements on
9696
    64-bit systems. */
9697
9698
2.34k
    case META_BACKREF:
9699
2.34k
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9700
2.34k
        (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9701
16
      goto ISNOTFIXED;
9702
2.32k
    group = META_DATA(*pptr);
9703
2.32k
    if (group < 10)
9704
604
      {
9705
604
      offset = cb->small_ref_offset[group];
9706
604
      goto RECURSE_OR_BACKREF_LENGTH;
9707
604
      }
9708
9709
    /* Fall through */
9710
    /* For groups >= 10 - picking up group twice does no harm. */
9711
9712
    /* A true recursion implies not fixed length, but a subroutine call may
9713
    be OK. Back reference "recursions" are also failed. */
9714
9715
81.9k
    case META_RECURSE:
9716
81.9k
    group = META_DATA(*pptr);
9717
81.9k
    GETPLUSOFFSET(offset, pptr);
9718
9719
82.6k
    RECURSE_OR_BACKREF_LENGTH:
9720
82.6k
    if (group > cb->bracount)
9721
91
      {
9722
91
      cb->erroroffset = offset;
9723
91
      *errcodeptr = ERR15;  /* Non-existent subpattern */
9724
91
      return -1;
9725
91
      }
9726
82.5k
    if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9727
150M
    for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9728
150M
      {
9729
150M
      if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9730
150M
        else if (*gptr == (META_CAPTURE | group)) break;
9731
150M
      }
9732
9733
    /* We must start the search for the end of the group at the first meta code
9734
    inside the group. Otherwise it will be treated as an enclosed group. */
9735
9736
82.5k
    gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9737
82.5k
    if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9738
82.5k
    if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9739
618k
    for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9740
82.4k
    if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9741
82.4k
    this_recurse.prev = recurses;
9742
82.4k
    this_recurse.groupptr = gptr;
9743
9744
    /* We do not need to know the position of the end of the group, that is,
9745
    gptr is not used after the call to get_grouplength(). Setting the second
9746
    argument FALSE stops it scanning for the end when the length can be found
9747
    in the cache. */
9748
9749
82.4k
    gptr++;
9750
82.4k
    grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9751
82.4k
      lcptr, group, &this_recurse, cb);
9752
82.4k
    if (grouplength < 0)
9753
1.98k
      {
9754
1.98k
      if (*errcodeptr == 0) goto ISNOTFIXED;
9755
1.98k
      return -1;  /* Error already set */
9756
1.98k
      }
9757
80.4k
    itemlength = grouplength;
9758
80.4k
    itemminlength = groupminlength;
9759
80.4k
    break;
9760
9761
    /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9762
    the length of this branch. Skip from the following item to the next
9763
    unpaired ket. */
9764
9765
0
    case META_COND_DEFINE:
9766
0
    pptr = parsed_skip(pptr + 1, PSKIP_KET);
9767
0
    break;
9768
9769
    /* Check other nested groups - advance past the initial data for each type
9770
    and then seek a fixed length with get_grouplength(). */
9771
9772
282
    case META_COND_NAME:
9773
1.63k
    case META_COND_NUMBER:
9774
1.75k
    case META_COND_RNAME:
9775
2.71k
    case META_COND_RNUMBER:
9776
2.71k
    pptr += 2 + SIZEOFFSET;
9777
2.71k
    goto CHECK_GROUP;
9778
9779
530
    case META_COND_ASSERT:
9780
530
    pptr += 1;
9781
530
    goto CHECK_GROUP;
9782
9783
102
    case META_COND_VERSION:
9784
102
    pptr += 4;
9785
102
    goto CHECK_GROUP;
9786
9787
143k
    case META_CAPTURE:
9788
143k
    group = META_DATA(*pptr);
9789
    /* Fall through */
9790
9791
145k
    case META_ATOMIC:
9792
170k
    case META_NOCAPTURE:
9793
170k
    case META_SCRIPT_RUN:
9794
170k
    pptr++;
9795
174k
    CHECK_GROUP:
9796
174k
    grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9797
174k
      lcptr, group, recurses, cb);
9798
174k
    if (grouplength < 0) return -1;
9799
170k
    itemlength = grouplength;
9800
170k
    itemminlength = groupminlength;
9801
170k
    break;
9802
9803
32.9k
    case META_QUERY:
9804
59.1k
    case META_QUERY_PLUS:
9805
59.7k
    case META_QUERY_QUERY:
9806
59.7k
    min = 0;
9807
59.7k
    max = 1;
9808
59.7k
    goto REPETITION;
9809
9810
    /* Exact repetition is OK; variable repetition is not. A repetition of zero
9811
    must subtract the length that has already been added. */
9812
9813
1.93k
    case META_MINMAX:
9814
2.32k
    case META_MINMAX_PLUS:
9815
4.17k
    case META_MINMAX_QUERY:
9816
4.17k
    min = pptr[1];
9817
4.17k
    max = pptr[2];
9818
4.17k
    pptr += 2;
9819
9820
63.8k
    REPETITION:
9821
63.8k
    if (max != REPEAT_UNLIMITED)
9822
63.8k
      {
9823
63.8k
      if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9824
63.8k
          max != 0 &&
9825
63.8k
          (INT_MAX - branchlength)/lastitemlength < max - 1)
9826
3
        {
9827
3
        *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9828
3
        return -1;
9829
3
        }
9830
63.8k
      if (min == 0) branchminlength -= lastitemminlength;
9831
2.15k
        else itemminlength = (min - 1) * lastitemminlength;
9832
63.8k
      if (max == 0) branchlength -= lastitemlength;
9833
61.9k
        else itemlength = (max - 1) * lastitemlength;
9834
63.8k
      break;
9835
63.8k
      }
9836
    /* Fall through */
9837
9838
    /* Any other item means this branch does not have a fixed length. */
9839
9840
138
    default:
9841
202
    ISNOTFIXED:
9842
202
    *errcodeptr = ERR25;   /* Not fixed length */
9843
202
    return -1;
9844
1.31M
    }
9845
9846
  /* Add the item length to the branchlength, checking for integer overflow and
9847
  for the branch length exceeding the overall limit. Later, if there is at
9848
  least one variable-length branch in the group, there is a test for the
9849
  (smaller) variable-length branch length limit. */
9850
9851
2.64M
  if (INT_MAX - branchlength < (int)itemlength ||
9852
2.64M
      (branchlength += itemlength) > LOOKBEHIND_MAX)
9853
4
    {
9854
4
    *errcodeptr = ERR87;
9855
4
    return -1;
9856
4
    }
9857
9858
2.64M
  branchminlength += itemminlength;
9859
9860
  /* Save this item length for use if the next item is a quantifier. */
9861
9862
2.64M
  lastitemlength = itemlength;
9863
2.64M
  lastitemminlength = itemminlength;
9864
2.64M
  }
9865
9866
312k
EXIT:
9867
312k
*pptrptr = pptr;
9868
312k
*minptr = branchminlength;
9869
312k
return branchlength;
9870
9871
0
PARSED_SKIP_FAILED:
9872
0
PCRE2_DEBUG_UNREACHABLE();
9873
0
*errcodeptr = ERR90;  /* Unhandled META code - internal error */
9874
0
return -1;
9875
318k
}
9876
9877
9878
9879
/*************************************************
9880
*        Set lengths in a lookbehind             *
9881
*************************************************/
9882
9883
/* This function is called for each lookbehind, to set the lengths in its
9884
branches. An error occurs if any branch does not have a limited maximum length
9885
that is less than the limit (65535). On exit, the pointer must be left on the
9886
final ket.
9887
9888
The function also maintains the max_lookbehind value. Any lookbehind branch
9889
that contains a nested lookbehind may actually look further back than the
9890
length of the branch. The additional amount is passed back from
9891
get_branchlength() as an "extra" value.
9892
9893
Arguments:
9894
  pptrptr     pointer to pointer in the parsed pattern
9895
  errcodeptr  pointer to error code
9896
  lcptr       pointer to loop counter
9897
  recurses    chain of recurse_check to catch mutual recursion
9898
  cb          pointer to compile block
9899
9900
Returns:      TRUE if all is well
9901
              FALSE otherwise, with error code and offset set
9902
*/
9903
9904
static BOOL
9905
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
9906
  parsed_recurse_check *recurses, compile_block *cb)
9907
34.0k
{
9908
34.0k
PCRE2_SIZE offset;
9909
34.0k
uint32_t *bptr = *pptrptr;   /* Item whose word receives the branch length */
9910
34.0k
uint32_t *gbptr = bptr;      /* Remembers the group start; gbptr[1] is set at the end */
9911
34.0k
int maxlength = 0;           /* Largest branch maximum seen so far */
9912
34.0k
int minlength = INT_MAX;     /* Smallest branch minimum seen so far */
9913
34.0k
BOOL variable = FALSE;       /* Becomes TRUE if any branch has max != min */
9914
9915
34.0k
READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
9916
34.0k
*pptrptr += SIZEOFFSET;      /* NOTE(review): presumably steps over the stored offset - confirm */
9917
9918
/* Each branch can have a different maximum length, but we can keep only a
9919
single minimum for the whole group, because there's nowhere to save individual
9920
values in the META_ALT item. */
9921
9922
34.0k
do
9923
47.6k
  {
9924
47.6k
  int branchlength, branchminlength;
9925
9926
47.6k
  *pptrptr += 1;             /* Move past the item that introduces this branch */
9927
47.6k
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9928
47.6k
    recurses, cb);
9929
9930
47.6k
  if (branchlength < 0)      /* Negative return means the length could not be found */
9931
651
    {
9932
    /* The errorcode and offset may already be set from a nested lookbehind. */
9933
651
    if (*errcodeptr == 0) *errcodeptr = ERR25;
9934
651
    if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
9935
651
    return FALSE;
9936
651
    }
9937
9938
46.9k
  if (branchlength != branchminlength) variable = TRUE;
9939
46.9k
  if (branchminlength < minlength) minlength = branchminlength;
9940
46.9k
  if (branchlength > maxlength) maxlength = branchlength;
9941
46.9k
  if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
9942
46.9k
  *bptr |= branchlength;  /* branchlength never more than 65535 */
9943
46.9k
  bptr = *pptrptr;           /* Item on which get_branchlength() stopped */
9944
46.9k
  }
9945
46.9k
while (META_CODE(*bptr) == META_ALT);   /* Another alternative follows */
9946
9947
/* If any branch is of variable length, the whole lookbehind is of variable
9948
length. If the maximum length of any branch exceeds the maximum for variable
9949
lookbehinds, give an error. Otherwise, the minimum length is set in the word
9950
that follows the original group META value. For a fixed-length lookbehind, this
9951
is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
9952
possibly different) length. */
9953
9954
33.4k
if (variable)
9955
12.2k
  {
9956
12.2k
  gbptr[1] = minlength;      /* Stored before the limit check below */
9957
12.2k
  if ((PCRE2_SIZE)maxlength > cb->max_varlookbehind)
9958
65
    {
9959
65
    *errcodeptr = ERR100;
9960
65
    cb->erroroffset = offset;
9961
65
    return FALSE;
9962
65
    }
9963
12.2k
  }
9964
21.1k
else gbptr[1] = LOOKBEHIND_MAX;
9965
9966
33.3k
return TRUE;
9967
33.4k
}
9968
9969
9970
9971
/*************************************************
9972
*         Check parsed pattern lookbehinds       *
9973
*************************************************/
9974
9975
/* This function is called at the end of parsing a pattern if any lookbehinds
9976
were encountered. It scans the parsed pattern for them, calling
9977
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
9978
the error offset is marked unset. This enables the functions above not to
9979
override settings from deeper nestings.
9980
9981
This function is called recursively from get_branchlength() for lookaheads in
9982
order to process any lookbehinds that they may contain. It stops when it hits a
9983
non-nested closing parenthesis in this case, returning a pointer to it.
9984
9985
Arguments:
9986
  pptr      points to where to start (start of pattern or start of lookahead)
9987
  retptr    if not NULL, return the ket pointer here
9988
  recurses  chain of recurse_check to catch mutual recursion
9989
  cb        points to the compile block
9990
  lcptr     points to loop counter
9991
9992
Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
9993
*/
9994
9995
static int
9996
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
9997
  parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
9998
35.7k
{
9999
35.7k
int errorcode = 0;
10000
35.7k
int nestlevel = 0;           /* Depth of groups opened during this scan */
10001
10002
35.7k
cb->erroroffset = PCRE2_UNSET;   /* Lets set_lookbehind_lengths() see that no offset is set yet */
10003
10004
14.2M
for (; *pptr != META_END; pptr++)
10005
14.2M
  {
10006
14.2M
  if (*pptr < META_END) continue;  /* Literal */
10007
10008
4.07M
  switch (META_CODE(*pptr))
10009
4.07M
    {
10010
4
    default:
10011
10012
    /* The following erroroffset is a bogus but safe value. This branch should
10013
    be avoided by providing a proper implementation for all supported cases
10014
    below. */
10015
10016
4
    PCRE2_DEBUG_UNREACHABLE();
10017
4
    cb->erroroffset = 0;
10018
4
    return ERR70;  /* Unrecognized meta code */
10019
10020
280k
    case META_ESCAPE:
10021
280k
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
10022
65.0k
      pptr += 1;    /* Skip prop data */
10023
280k
    break;
10024
10025
299k
    case META_KET:
10026
299k
    if (--nestlevel < 0)     /* Ket closes a group that was not opened in this scan */
10027
30.3k
      {
10028
30.3k
      if (retptr != NULL) *retptr = pptr;
10029
30.3k
      return 0;
10030
30.3k
      }
10031
269k
    break;
10032
10033
269k
    case META_ATOMIC:        /* Group starts: only the nesting depth changes */
10034
209k
    case META_CAPTURE:
10035
215k
    case META_COND_ASSERT:
10036
215k
    case META_SCS:
10037
234k
    case META_LOOKAHEAD:
10038
243k
    case META_LOOKAHEADNOT:
10039
244k
    case META_LOOKAHEAD_NA:
10040
264k
    case META_NOCAPTURE:
10041
265k
    case META_SCRIPT_RUN:
10042
265k
    nestlevel++;
10043
265k
    break;
10044
10045
1.64k
    case META_ACCEPT:        /* Items for which nothing extra is skipped */
10046
198k
    case META_ALT:
10047
332k
    case META_ASTERISK:
10048
334k
    case META_ASTERISK_PLUS:
10049
344k
    case META_ASTERISK_QUERY:
10050
366k
    case META_BACKREF:
10051
406k
    case META_CIRCUMFLEX:
10052
475k
    case META_CLASS:
10053
479k
    case META_CLASS_EMPTY:
10054
482k
    case META_CLASS_EMPTY_NOT:
10055
597k
    case META_CLASS_END:
10056
643k
    case META_CLASS_NOT:
10057
645k
    case META_COMMIT:
10058
693k
    case META_DOLLAR:
10059
818k
    case META_DOT:
10060
819k
    case META_FAIL:
10061
947k
    case META_PLUS:
10062
958k
    case META_PLUS_PLUS:
10063
960k
    case META_PLUS_QUERY:
10064
962k
    case META_PRUNE:
10065
1.22M
    case META_QUERY:
10066
1.24M
    case META_QUERY_PLUS:
10067
1.24M
    case META_QUERY_QUERY:
10068
1.24M
    case META_RANGE_ESCAPED:
10069
1.25M
    case META_RANGE_LITERAL:
10070
1.25M
    case META_SKIP:
10071
1.25M
    case META_THEN:
10072
1.25M
    break;
10073
10074
0
    case META_OFFSET:        /* Skip SIZEOFFSET following words */
10075
88.8k
    case META_RECURSE:
10076
88.8k
    pptr += SIZEOFFSET;
10077
88.8k
    break;
10078
10079
8.27k
    case META_BACKREF_BYNAME:   /* Skip one word plus SIZEOFFSET words */
10080
8.69k
    case META_RECURSE_BYNAME:
10081
8.69k
    pptr += 1 + SIZEOFFSET;
10082
8.69k
    break;
10083
10084
0
    case META_COND_DEFINE:   /* Opens a group and skips SIZEOFFSET words */
10085
0
    pptr += SIZEOFFSET;
10086
0
    nestlevel++;
10087
0
    break;
10088
10089
876
    case META_COND_NAME:     /* Opens a group and skips 1 + SIZEOFFSET words */
10090
961
    case META_COND_NUMBER:
10091
1.51k
    case META_COND_RNAME:
10092
3.71k
    case META_COND_RNUMBER:
10093
3.71k
    pptr += 1 + SIZEOFFSET;
10094
3.71k
    nestlevel++;
10095
3.71k
    break;
10096
10097
245
    case META_COND_VERSION:  /* Opens a group and skips three words */
10098
245
    pptr += 3;
10099
245
    nestlevel++;
10100
245
    break;
10101
10102
3.23k
    case META_CALLOUT_STRING:
10103
3.23k
    pptr += 3 + SIZEOFFSET;
10104
3.23k
    break;
10105
10106
0
    case META_BIGVALUE:      /* Skip one following word */
10107
3.37k
    case META_POSIX:
10108
3.63k
    case META_POSIX_NEG:
10109
3.63k
    case META_CAPTURE_NAME:
10110
3.70k
    case META_CAPTURE_NUMBER:
10111
3.70k
    pptr += 1;
10112
3.70k
    break;
10113
10114
46.5k
    case META_MINMAX:        /* Skip two following words */
10115
47.1k
    case META_MINMAX_QUERY:
10116
60.2k
    case META_MINMAX_PLUS:
10117
61.6k
    case META_OPTIONS:
10118
61.6k
    pptr += 2;
10119
61.6k
    break;
10120
10121
1.76M
    case META_CALLOUT_NUMBER:
10122
1.76M
    pptr += 3;
10123
1.76M
    break;
10124
10125
690
    case META_MARK:
10126
1.04k
    case META_COMMIT_ARG:
10127
1.72k
    case META_PRUNE_ARG:
10128
2.63k
    case META_SKIP_ARG:
10129
12.5k
    case META_THEN_ARG:
10130
12.5k
    pptr += 1 + pptr[1];     /* pptr[1] gives the number of further words to skip */
10131
12.5k
    break;
10132
10133
    /* Note that set_lookbehind_lengths() updates pptr, leaving it pointing to
10134
    the final ket of the group, so no need to update it here. */
10135
10136
6.16k
    case META_LOOKBEHIND:
10137
21.9k
    case META_LOOKBEHINDNOT:
10138
23.4k
    case META_LOOKBEHIND_NA:
10139
23.4k
    if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10140
470
      return errorcode;
10141
22.9k
    break;
10142
4.07M
    }
10143
4.07M
  }
10144
10145
4.93k
return 0;                    /* Reached META_END without error */
10146
35.7k
}
10147
10148
10149
10150
/*************************************************
10151
*     External function to compile a pattern     *
10152
*************************************************/
10153
10154
/* This function reads a regular expression in the form of a string and returns
10155
a pointer to a block of store holding a compiled version of the expression.
10156
10157
Arguments:
10158
  pattern       the regular expression
10159
  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10160
  options       option bits
10161
  errorptr      pointer to errorcode
10162
  erroroffset   pointer to error offset
10163
  ccontext      points to a compile context or is NULL
10164
10165
Returns:        pointer to compiled data block, or NULL on error,
10166
                with errorcode and erroroffset set
10167
*/
10168
10169
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
10170
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10171
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10172
64.8k
{
10173
64.8k
BOOL utf;                             /* Set TRUE for UTF mode */
10174
64.8k
BOOL ucp;                             /* Set TRUE for UCP mode */
10175
64.8k
BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10176
64.8k
BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10177
64.8k
pcre2_real_code *re = NULL;           /* What we will return */
10178
64.8k
compile_block cb;                     /* "Static" compile-time data */
10179
64.8k
const uint8_t *tables;                /* Char tables base pointer */
10180
10181
64.8k
PCRE2_UCHAR null_str[1] = { 0xcd };   /* Dummy for handling null inputs */
10182
64.8k
PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10183
64.8k
PCRE2_UCHAR *codestart;               /* Start of compiled code */
10184
64.8k
PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10185
64.8k
uint32_t *pptr;                       /* Current pointer in parsed pattern */
10186
10187
64.8k
PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10188
64.8k
PCRE2_SIZE usedlength;                /* Actual length used */
10189
64.8k
PCRE2_SIZE re_blocksize;              /* Size of memory block */
10190
64.8k
PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10191
10192
64.8k
uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10193
64.8k
uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10194
64.8k
uint32_t setflags = 0;                /* NL and BSR set flags */
10195
64.8k
uint32_t xoptions;                    /* Flags from context, modified */
10196
10197
64.8k
uint32_t skipatstart;                 /* When checking (*UTF) etc */
10198
64.8k
uint32_t limit_heap  = UINT32_MAX;
10199
64.8k
uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10200
64.8k
uint32_t limit_depth = UINT32_MAX;
10201
10202
64.8k
int newline = 0;                      /* Unset; can be set by the pattern */
10203
64.8k
int bsr = 0;                          /* Unset; can be set by the pattern */
10204
64.8k
int errorcode = 0;                    /* Initialize to avoid compiler warn */
10205
64.8k
int regexrc;                          /* Return from compile */
10206
10207
64.8k
uint32_t i;                           /* Local loop counter */
10208
10209
/* Enable all optimizations by default. */
10210
64.8k
uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags :
10211
64.8k
                                          PCRE2_OPTIMIZATION_ALL;
10212
10213
/* Comments at the head of this file explain about these variables. */
10214
10215
64.8k
uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10216
64.8k
uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10217
64.8k
named_group named_groups[NAMED_GROUP_LIST_SIZE];
10218
10219
/* The workspace is used in different ways in the different compiling phases.
10220
It needs to be 16-bit aligned for the preliminary parsing scan. */
10221
10222
64.8k
uint32_t c16workspace[C16_WORK_SIZE];
10223
64.8k
PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10224
10225
10226
/* -------------- Check arguments and set up the pattern ----------------- */
10227
10228
/* There must be error code and offset pointers. */
10229
10230
64.8k
if (errorptr == NULL || erroroffset == NULL) return NULL;
10231
64.8k
*errorptr = ERR0;
10232
64.8k
*erroroffset = 0;
10233
10234
/* There must be a pattern, but NULL is allowed with zero length. */
10235
10236
64.8k
if (pattern == NULL)
10237
0
  {
10238
0
  if (patlen == 0)
10239
0
    pattern = null_str;
10240
0
  else
10241
0
    {
10242
0
    *errorptr = ERR16;
10243
0
    return NULL;
10244
0
    }
10245
0
  }
10246
10247
/* A NULL compile context means "use a default context" */
10248
10249
64.8k
if (ccontext == NULL)
10250
0
  ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10251
10252
/* PCRE2_MATCH_INVALID_UTF implies UTF */
10253
10254
64.8k
if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10255
10256
/* Check that all undefined public option bits are zero. */
10257
10258
64.8k
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10259
64.8k
    (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10260
0
  {
10261
0
  *errorptr = ERR17;
10262
0
  return NULL;
10263
0
  }
10264
10265
64.8k
if ((options & PCRE2_LITERAL) != 0 &&
10266
64.8k
    ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10267
0
     (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10268
0
  {
10269
0
  *errorptr = ERR92;
10270
0
  return NULL;
10271
0
  }
10272
10273
/* A zero-terminated pattern is indicated by the special length value
10274
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10275
10276
64.8k
if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10277
0
  patlen = PRIV(strlen)(pattern);
10278
64.8k
(void)zero_terminated; /* Silence compiler; only used if Valgrind enabled */
10279
10280
64.8k
if (patlen > ccontext->max_pattern_length)
10281
0
  {
10282
0
  *errorptr = ERR88;
10283
0
  return NULL;
10284
0
  }
10285
10286
/* Optimization flags in 'options' can override those in the compile context.
10287
This is because some options to disable optimizations were added before the
10288
optimization flags word existed, and we need to continue supporting them
10289
for backwards compatibility. */
10290
10291
64.8k
if ((options & PCRE2_NO_AUTO_POSSESS) != 0)
10292
15.1k
  optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;
10293
64.8k
if ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
10294
7.13k
  optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;
10295
64.8k
if ((options & PCRE2_NO_START_OPTIMIZE) != 0)
10296
13.3k
  optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;
10297
10298
/* From here on, all returns from this function should end up going via the
10299
EXIT label. */
10300
10301
10302
/* ------------ Initialize the "static" compile data -------------- */
10303
10304
64.8k
tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10305
10306
64.8k
cb.lcc = tables + lcc_offset;          /* Individual */
10307
64.8k
cb.fcc = tables + fcc_offset;          /*   character */
10308
64.8k
cb.cbits = tables + cbits_offset;      /*      tables */
10309
64.8k
cb.ctypes = tables + ctypes_offset;
10310
10311
64.8k
cb.assert_depth = 0;
10312
64.8k
cb.bracount = 0;
10313
64.8k
cb.cx = ccontext;
10314
64.8k
cb.dupnames = FALSE;
10315
64.8k
cb.end_pattern = pattern + patlen;
10316
64.8k
cb.erroroffset = 0;
10317
64.8k
cb.external_flags = 0;
10318
64.8k
cb.external_options = options;
10319
64.8k
cb.groupinfo = stack_groupinfo;
10320
64.8k
cb.had_recurse = FALSE;
10321
64.8k
cb.lastcapture = 0;
10322
64.8k
cb.max_lookbehind = 0;                               /* Max encountered */
10323
64.8k
cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10324
64.8k
cb.name_entry_size = 0;
10325
64.8k
cb.name_table = NULL;
10326
64.8k
cb.named_groups = named_groups;
10327
64.8k
cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10328
64.8k
cb.names_found = 0;
10329
64.8k
cb.parens_depth = 0;
10330
64.8k
cb.parsed_pattern = stack_parsed_pattern;
10331
64.8k
cb.req_varyopt = 0;
10332
64.8k
cb.start_code = cworkspace;
10333
64.8k
cb.start_pattern = pattern;
10334
64.8k
cb.start_workspace = cworkspace;
10335
64.8k
cb.workspace_size = COMPILE_WORK_SIZE;
10336
64.8k
cb.first_data = NULL;
10337
64.8k
cb.last_data = NULL;
10338
64.8k
#ifdef SUPPORT_WIDE_CHARS
10339
64.8k
cb.char_lists_size = 0;
10340
64.8k
#endif
10341
10342
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10343
references to help in deciding whether (.*) can be treated as anchored or not.
10344
*/
10345
10346
64.8k
cb.top_backref = 0;
10347
64.8k
cb.backref_map = 0;
10348
10349
/* Escape sequences \1 to \9 are always back references, but as they are only
10350
two characters long, only two elements can be used in the parsed_pattern
10351
vector. The first contains the reference, and we'd like to use the second to
10352
record the offset in the pattern, so that forward references to non-existent
10353
groups can be diagnosed later with an offset. However, on 64-bit systems,
10354
PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10355
occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10356
references have enough space for the offset to be put into the parsed pattern.
10357
*/
10358
10359
713k
for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10360
10361
10362
/* --------------- Start looking at the pattern --------------- */
10363
10364
/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10365
the start of the pattern, and remember the offset to the actual regex. With
10366
valgrind support, make the terminator of a zero-terminated pattern
10367
inaccessible. This catches bugs that would otherwise only show up for
10368
non-zero-terminated patterns. */
10369
10370
#ifdef SUPPORT_VALGRIND
10371
if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10372
#endif
10373
10374
64.8k
xoptions = ccontext->extra_options;
10375
64.8k
ptr = pattern;
10376
64.8k
skipatstart = 0;
10377
10378
64.8k
if ((options & PCRE2_LITERAL) == 0)
10379
64.8k
  {
10380
68.0k
  while (patlen - skipatstart >= 2 &&
10381
68.0k
         ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10382
68.0k
         ptr[skipatstart+1] == CHAR_ASTERISK)
10383
3.84k
    {
10384
71.7k
    for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10385
71.0k
      {
10386
71.0k
      const pso *p = pso_list + i;
10387
10388
71.0k
      if (patlen - skipatstart - 2 >= p->length &&
10389
71.0k
          PRIV(strncmp_c8)(ptr + skipatstart + 2, p->name, p->length) == 0)
10390
3.19k
        {
10391
3.19k
        uint32_t c, pp;
10392
10393
3.19k
        skipatstart += p->length + 2;
10394
3.19k
        switch(p->type)
10395
3.19k
          {
10396
276
          case PSO_OPT:
10397
276
          cb.external_options |= p->value;
10398
276
          break;
10399
10400
0
          case PSO_XOPT:
10401
0
          xoptions |= p->value;
10402
0
          break;
10403
10404
0
          case PSO_FLG:
10405
0
          setflags |= p->value;
10406
0
          break;
10407
10408
2.91k
          case PSO_NL:
10409
2.91k
          newline = p->value;
10410
2.91k
          setflags |= PCRE2_NL_SET;
10411
2.91k
          break;
10412
10413
0
          case PSO_BSR:
10414
0
          bsr = p->value;
10415
0
          setflags |= PCRE2_BSR_SET;
10416
0
          break;
10417
10418
0
          case PSO_LIMM:
10419
0
          case PSO_LIMD:
10420
0
          case PSO_LIMH:
10421
0
          c = 0;
10422
0
          pp = skipatstart;
10423
0
          while (pp < patlen && IS_DIGIT(ptr[pp]))
10424
0
            {
10425
0
            if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10426
0
            c = c*10 + (ptr[pp++] - CHAR_0);
10427
0
            }
10428
0
          if (pp >= patlen || pp == skipatstart || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
10429
0
            {
10430
0
            errorcode = ERR60;
10431
0
            ptr += pp;
10432
0
            goto HAD_EARLY_ERROR;
10433
0
            }
10434
0
          if (p->type == PSO_LIMH) limit_heap = c;
10435
0
            else if (p->type == PSO_LIMM) limit_match = c;
10436
0
            else limit_depth = c;
10437
0
          skipatstart = ++pp;
10438
0
          break;
10439
10440
0
          case PSO_OPTMZ:
10441
0
          optim_flags &= ~(p->value);
10442
10443
          /* For backward compatibility the three original VERBs to disable
10444
          optimizations need to also update the corresponding bit in the
10445
          external options. */
10446
10447
0
          switch(p->value)
10448
0
            {
10449
0
            case PCRE2_OPTIM_AUTO_POSSESS:
10450
0
            cb.external_options |= PCRE2_NO_AUTO_POSSESS;
10451
0
            break;
10452
10453
0
            case PCRE2_OPTIM_DOTSTAR_ANCHOR:
10454
0
            cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR;
10455
0
            break;
10456
10457
0
            case PCRE2_OPTIM_START_OPTIMIZE:
10458
0
            cb.external_options |= PCRE2_NO_START_OPTIMIZE;
10459
0
            break;
10460
0
            }
10461
10462
0
          break;
10463
10464
0
          default:
10465
          /* All values in the enum need an explicit entry for this switch
10466
          but until a better way to prevent coding mistakes is invented keep
10467
          a catch all that triggers a debug build assert as a failsafe */
10468
0
          PCRE2_DEBUG_UNREACHABLE();
10469
3.19k
          }
10470
3.19k
        break;   /* Out of the table scan loop */
10471
3.19k
        }
10472
71.0k
      }
10473
3.84k
    if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10474
3.84k
    }
10475
64.8k
    PCRE2_ASSERT(skipatstart <= patlen);
10476
64.8k
  }
10477
10478
/* End of pattern-start options; advance to start of real regex. */
10479
10480
64.8k
ptr += skipatstart;
10481
10482
/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10483
10484
#ifndef SUPPORT_UNICODE
10485
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10486
  {
10487
  errorcode = ERR32;
10488
  goto HAD_EARLY_ERROR;
10489
  }
10490
#endif
10491
10492
/* Check UTF. We have the original options in 'options', with that value as
10493
modified by (*UTF) etc in cb->external_options. The extra option
10494
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10495
surrogate code points cannot be represented in UTF-16. */
10496
10497
64.8k
utf = (cb.external_options & PCRE2_UTF) != 0;
10498
64.8k
if (utf)
10499
16.8k
  {
10500
16.8k
  if ((options & PCRE2_NEVER_UTF) != 0)
10501
0
    {
10502
0
    errorcode = ERR74;
10503
0
    goto HAD_EARLY_ERROR;
10504
0
    }
10505
16.8k
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10506
16.8k
       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10507
2.98k
    goto HAD_ERROR;  /* Offset was set by valid_utf() */
10508
10509
#if PCRE2_CODE_UNIT_WIDTH == 16
10510
  if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10511
    {
10512
    errorcode = ERR91;
10513
    goto HAD_EARLY_ERROR;
10514
    }
10515
#endif
10516
16.8k
  }
10517
10518
/* Check UCP lockout. */
10519
10520
61.8k
ucp = (cb.external_options & PCRE2_UCP) != 0;
10521
61.8k
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10522
0
  {
10523
0
  errorcode = ERR75;
10524
0
  goto HAD_EARLY_ERROR;
10525
0
  }
10526
10527
/* PCRE2_EXTRA_TURKISH_CASING checks */
10528
10529
61.8k
if ((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)
10530
0
  {
10531
0
  if (!utf && !ucp)
10532
0
    {
10533
0
    errorcode = ERR104;
10534
0
    goto HAD_EARLY_ERROR;
10535
0
    }
10536
10537
0
#if PCRE2_CODE_UNIT_WIDTH == 8
10538
0
  if (!utf)
10539
0
    {
10540
0
    errorcode = ERR105;
10541
0
    goto HAD_EARLY_ERROR;
10542
0
    }
10543
0
#endif
10544
10545
0
  if ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)
10546
0
    {
10547
0
    errorcode = ERR106;
10548
0
    goto HAD_EARLY_ERROR;
10549
0
    }
10550
0
  }
10551
10552
/* Process the BSR setting. */
10553
10554
61.8k
if (bsr == 0) bsr = ccontext->bsr_convention;
10555
10556
/* Process the newline setting. */
10557
10558
61.8k
if (newline == 0) newline = ccontext->newline_convention;
10559
61.8k
cb.nltype = NLTYPE_FIXED;
10560
61.8k
switch(newline)
10561
61.8k
  {
10562
21
  case PCRE2_NEWLINE_CR:
10563
21
  cb.nllen = 1;
10564
21
  cb.nl[0] = CHAR_CR;
10565
21
  break;
10566
10567
59.1k
  case PCRE2_NEWLINE_LF:
10568
59.1k
  cb.nllen = 1;
10569
59.1k
  cb.nl[0] = CHAR_NL;
10570
59.1k
  break;
10571
10572
0
  case PCRE2_NEWLINE_NUL:
10573
0
  cb.nllen = 1;
10574
0
  cb.nl[0] = CHAR_NUL;
10575
0
  break;
10576
10577
954
  case PCRE2_NEWLINE_CRLF:
10578
954
  cb.nllen = 2;
10579
954
  cb.nl[0] = CHAR_CR;
10580
954
  cb.nl[1] = CHAR_NL;
10581
954
  break;
10582
10583
1.26k
  case PCRE2_NEWLINE_ANY:
10584
1.26k
  cb.nltype = NLTYPE_ANY;
10585
1.26k
  break;
10586
10587
524
  case PCRE2_NEWLINE_ANYCRLF:
10588
524
  cb.nltype = NLTYPE_ANYCRLF;
10589
524
  break;
10590
10591
0
  default:
10592
0
  PCRE2_DEBUG_UNREACHABLE();
10593
0
  errorcode = ERR56;
10594
0
  goto HAD_EARLY_ERROR;
10595
61.8k
  }
10596
10597
/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10598
their numerical equivalents, so that this information is always available for
10599
the remaining processing. (2) At the same time, parse the pattern and put a
10600
processed version into the parsed_pattern vector. This has escapes interpreted
10601
and comments removed (amongst other things). */
10602
10603
/* Ensure that the parsed pattern buffer is big enough. For many smaller
10604
patterns the vector on the stack (which was set up above) can be used. */
10605
10606
61.8k
parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options);
10607
10608
/* Allow for 2x uint32_t at the start and 2 at the end, for
10609
PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). */
10610
10611
61.8k
if ((ccontext->extra_options &
10612
61.8k
     (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10613
0
  parsed_size_needed += 4;
10614
10615
/* When PCRE2_AUTO_CALLOUT is set we allow for one callout at the end. */
10616
10617
61.8k
if ((options & PCRE2_AUTO_CALLOUT) != 0)
10618
10.4k
  parsed_size_needed += 4;
10619
10620
61.8k
parsed_size_needed += 1;  /* For the final META_END */
10621
10622
61.8k
if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE)
10623
4.93k
  {
10624
4.93k
  uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10625
4.93k
    parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data);
10626
4.93k
  if (heap_parsed_pattern == NULL)
10627
0
    {
10628
0
    *errorptr = ERR21;
10629
0
    goto EXIT;
10630
0
    }
10631
4.93k
  cb.parsed_pattern = heap_parsed_pattern;
10632
4.93k
  }
10633
61.8k
cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed;
10634
10635
/* Do the parsing scan. */
10636
10637
61.8k
errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb);
10638
61.8k
if (errorcode != 0) goto HAD_CB_ERROR;
10639
10640
/* If there are any lookbehinds, scan the parsed pattern to figure out their
10641
lengths. Workspace is needed to remember whether numbered groups are or are not
10642
of limited length, and if limited, what the minimum and maximum lengths are.
10643
This caching saves re-computing the length of any group that is referenced more
10644
than once, which is particularly relevant when recursion is involved.
10645
Unnumbered groups do not have this exposure because they cannot be referenced.
10646
If there are sufficiently few groups, the default index vector on the stack, as
10647
set up above, can be used. Otherwise we have to get/free some heap memory. The
10648
vector must be initialized to zero. */
10649
10650
56.5k
if (has_lookbehind)
10651
5.35k
  {
10652
5.35k
  int loopcount = 0;
10653
5.35k
  if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10654
103
    {
10655
103
    cb.groupinfo = ccontext->memctl.malloc(
10656
103
      (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10657
103
    if (cb.groupinfo == NULL)
10658
0
      {
10659
0
      errorcode = ERR21;
10660
0
      cb.erroroffset = 0;
10661
0
      goto HAD_CB_ERROR;
10662
0
      }
10663
103
    }
10664
5.35k
  memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10665
5.35k
  errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10666
5.35k
  if (errorcode != 0) goto HAD_CB_ERROR;
10667
5.35k
  }
10668
10669
/* For debugging, there is a function that shows the parsed pattern vector. */
10670
10671
#ifdef DEBUG_SHOW_PARSED
10672
fprintf(stderr, "+++ Pre-scan complete:\n");
10673
show_parsed(&cb);
10674
#endif
10675
10676
/* For debugging capturing information this code can be enabled. */
10677
10678
#ifdef DEBUG_SHOW_CAPTURES
10679
  {
10680
  named_group *ng = cb.named_groups;
10681
  fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10682
  for (i = 0; i < cb.names_found; i++, ng++)
10683
    {
10684
    fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10685
    }
10686
  }
10687
#endif
10688
10689
/* Pretend to compile the pattern while actually just accumulating the amount
10690
of memory required in the 'length' variable. This behaviour is triggered by
10691
passing a non-NULL final argument to compile_regex(). We pass a block of
10692
workspace (cworkspace) for it to compile parts of the pattern into; the
10693
compiled code is discarded when it is no longer needed, so hopefully this
10694
workspace will never overflow, though there is a test for its doing so.
10695
10696
On error, errorcode will be set non-zero, so we don't need to look at the
10697
result of the function. The initial options have been put into the cb block,
10698
but we still have to pass a separate options variable (the first argument)
10699
because the options may change as the pattern is processed. */
10700
10701
56.0k
cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10702
56.0k
pptr = cb.parsed_pattern;
10703
56.0k
code = cworkspace;
10704
56.0k
*code = OP_BRA;
10705
10706
56.0k
(void)compile_regex(cb.external_options, xoptions, &code, &pptr,
10707
56.0k
   &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10708
56.0k
   &cb, &length);
10709
10710
56.0k
if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10711
10712
/* This should be caught in compile_regex(), but just in case... */
10713
10714
54.9k
#if defined SUPPORT_WIDE_CHARS
10715
54.9k
PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);
10716
54.9k
if (length > MAX_PATTERN_SIZE ||
10717
54.9k
    MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))
10718
#else
10719
if (length > MAX_PATTERN_SIZE)
10720
#endif
10721
0
  {
10722
0
  errorcode = ERR20;
10723
0
  goto HAD_CB_ERROR;
10724
0
  }
10725
10726
/* Compute the size of, then, if not too large, get and initialize the data
10727
block for storing the compiled pattern and names table. Integer overflow should
10728
no longer be possible because nowadays we limit the maximum value of
10729
cb.names_found and cb.name_entry_size. */
10730
10731
54.9k
re_blocksize =
10732
54.9k
  CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10733
10734
54.9k
#if defined SUPPORT_WIDE_CHARS
10735
54.9k
if (cb.char_lists_size != 0)
10736
724
  {
10737
724
#if PCRE2_CODE_UNIT_WIDTH != 32
10738
  /* Align to 32 bit first. This ensures the
10739
  allocated area will also be 32 bit aligned. */
10740
724
  re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));
10741
724
#endif
10742
724
  re_blocksize += cb.char_lists_size;
10743
724
  }
10744
54.9k
#endif
10745
10746
54.9k
re_blocksize += CU2BYTES(length);
10747
10748
54.9k
if (re_blocksize > ccontext->max_pattern_compiled_length)
10749
64
  {
10750
64
  errorcode = ERR101;
10751
64
  goto HAD_CB_ERROR;
10752
64
  }
10753
10754
54.9k
re_blocksize += sizeof(pcre2_real_code);
10755
54.9k
re = (pcre2_real_code *)
10756
54.9k
  ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10757
54.9k
if (re == NULL)
10758
0
  {
10759
0
  errorcode = ERR21;
10760
0
  goto HAD_CB_ERROR;
10761
0
  }
10762
10763
/* The compiler may put padding at the end of the pcre2_real_code structure in
10764
order to round it up to a multiple of 4 or 8 bytes. This means that when a
10765
compiled pattern is copied (for example, when serialized) undefined bytes are
10766
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10767
write to the last 8 bytes of the structure before setting the fields. */
10768
10769
54.9k
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10770
54.9k
re->memctl = ccontext->memctl;
10771
54.9k
re->tables = tables;
10772
54.9k
re->executable_jit = NULL;
10773
54.9k
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10774
54.9k
re->blocksize = re_blocksize;
10775
54.9k
re->code_start = re_blocksize - CU2BYTES(length);
10776
54.9k
re->magic_number = MAGIC_NUMBER;
10777
54.9k
re->compile_options = options;
10778
54.9k
re->overall_options = cb.external_options;
10779
54.9k
re->extra_options = xoptions;
10780
54.9k
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10781
54.9k
re->limit_heap = limit_heap;
10782
54.9k
re->limit_match = limit_match;
10783
54.9k
re->limit_depth = limit_depth;
10784
54.9k
re->first_codeunit = 0;
10785
54.9k
re->last_codeunit = 0;
10786
54.9k
re->bsr_convention = bsr;
10787
54.9k
re->newline_convention = newline;
10788
54.9k
re->max_lookbehind = 0;
10789
54.9k
re->minlength = 0;
10790
54.9k
re->top_bracket = 0;
10791
54.9k
re->top_backref = 0;
10792
54.9k
re->name_entry_size = cb.name_entry_size;
10793
54.9k
re->name_count = cb.names_found;
10794
54.9k
re->optimization_flags = optim_flags;
10795
10796
/* The basic block is immediately followed by the name table, then by any space
10797
reserved for character lists (SUPPORT_WIDE_CHARS builds); the compiled code comes last. */
10798
10799
54.9k
codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
10800
10801
/* Update the compile data block for the actual compile. The starting points of
10802
the name/number translation table and of the code are passed around in the
10803
compile data block. The start/end pattern and initial options are already set
10804
from the pre-compile phase, as is the name_entry_size field. */
10805
10806
54.9k
cb.parens_depth = 0;
10807
54.9k
cb.assert_depth = 0;
10808
54.9k
cb.lastcapture = 0;
10809
54.9k
cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10810
54.9k
cb.start_code = codestart;
10811
54.9k
cb.req_varyopt = 0;
10812
54.9k
cb.had_accept = FALSE;
10813
54.9k
cb.had_pruneorskip = FALSE;
10814
54.9k
#ifdef SUPPORT_WIDE_CHARS
10815
54.9k
cb.char_lists_size = 0;
10816
54.9k
#endif
10817
10818
10819
/* If any named groups were found, create the name/number table from the list
10820
created in the pre-pass. */
10821
10822
54.9k
if (cb.names_found > 0)
10823
1.09k
  {
10824
1.09k
  named_group *ng = cb.named_groups;
10825
1.09k
  uint32_t tablecount = 0;
10826
10827
  /* Length 0 represents duplicates, and they have already been handled. */
10828
10.0k
  for (i = 0; i < cb.names_found; i++, ng++)
10829
8.96k
    if (ng->length > 0)
10830
1.87k
      tablecount = PRIV(compile_add_name_to_table)(&cb, ng, tablecount);
10831
10832
1.09k
  PCRE2_ASSERT(tablecount == cb.names_found);
10833
1.09k
  }
10834
10835
/* Set up a starting, non-extracting bracket, then compile the expression. On
10836
error, errorcode will be set non-zero, so we don't need to look at the result
10837
of the function here. */
10838
10839
54.9k
pptr = cb.parsed_pattern;
10840
54.9k
code = (PCRE2_UCHAR *)codestart;
10841
54.9k
*code = OP_BRA;
10842
54.9k
regexrc = compile_regex(re->overall_options, re->extra_options, &code,
10843
54.9k
  &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10844
54.9k
  NULL, &cb, NULL);
10845
54.9k
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10846
54.9k
re->top_bracket = cb.bracount;
10847
54.9k
re->top_backref = cb.top_backref;
10848
54.9k
re->max_lookbehind = cb.max_lookbehind;
10849
10850
54.9k
if (cb.had_accept)
10851
2.46k
  {
10852
2.46k
  reqcu = 0;                     /* Must disable after (*ACCEPT) */
10853
2.46k
  reqcuflags = REQ_NONE;
10854
2.46k
  re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10855
2.46k
  }
10856
10857
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10858
but the estimated length exceeds the really used length, adjust the value of
10859
re->blocksize, and if valgrind support is configured, mark the extra allocated
10860
memory as unaddressable, so that any out-of-bound reads can be detected. */
10861
10862
54.9k
*code++ = OP_END;
10863
54.9k
usedlength = code - codestart;
10864
54.9k
if (usedlength > length)
10865
0
  {
10866
0
  PCRE2_DEBUG_UNREACHABLE();
10867
0
  errorcode = ERR23;  /* Overflow of code block - internal error */
10868
0
  }
10869
54.9k
else
10870
54.9k
  {
10871
54.9k
  re->blocksize -= CU2BYTES(length - usedlength);
10872
#ifdef SUPPORT_VALGRIND
10873
  VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10874
#endif
10875
54.9k
  }
10876
10877
/* Scan the pattern for recursion/subroutine calls and convert the group
10878
numbers into offsets. Maintain a small cache so that repeated groups containing
10879
recursions are efficiently handled. */
10880
10881
85.3k
#define RSCAN_CACHE_SIZE 8
10882
10883
54.9k
if (errorcode == 0 && cb.had_recurse)
10884
8.09k
  {
10885
8.09k
  PCRE2_UCHAR *rcode;
10886
8.09k
  PCRE2_SPTR rgroup;
10887
8.09k
  unsigned int ccount = 0;
10888
8.09k
  int start = RSCAN_CACHE_SIZE;
10889
8.09k
  recurse_cache rc[RSCAN_CACHE_SIZE];
10890
10891
8.09k
  for (rcode = find_recurse(codestart, utf);
10892
485k
       rcode != NULL;
10893
477k
       rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))
10894
477k
    {
10895
477k
    int p, groupnumber;
10896
10897
477k
    groupnumber = (int)GET(rcode, 1);
10898
477k
    if (groupnumber == 0) rgroup = codestart; else
10899
161k
      {
10900
161k
      PCRE2_SPTR search_from = codestart;
10901
161k
      rgroup = NULL;
10902
705k
      for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
10903
636k
        {
10904
636k
        if (groupnumber == rc[p].groupnumber)
10905
92.2k
          {
10906
92.2k
          rgroup = rc[p].group;
10907
92.2k
          break;
10908
92.2k
          }
10909
10910
        /* Group n+1 must always start to the right of group n, so we can save
10911
        search time below when the new group number is greater than any of the
10912
        previously found groups. */
10913
10914
544k
        if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
10915
544k
        }
10916
10917
161k
      if (rgroup == NULL)
10918
69.2k
        {
10919
69.2k
        rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
10920
69.2k
        if (rgroup == NULL)
10921
0
          {
10922
0
          PCRE2_DEBUG_UNREACHABLE();
10923
0
          errorcode = ERR53;
10924
0
          break;
10925
0
          }
10926
69.2k
        if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
10927
69.2k
        rc[start].groupnumber = groupnumber;
10928
69.2k
        rc[start].group = rgroup;
10929
69.2k
        if (ccount < RSCAN_CACHE_SIZE) ccount++;
10930
69.2k
        }
10931
161k
      }
10932
10933
477k
    PUT(rcode, 1, (uint32_t)(rgroup - codestart));
10934
477k
    }
10935
8.09k
  }
10936
10937
/* In rare debugging situations we sometimes need to look at the compiled code
10938
at this stage. */
10939
10940
#ifdef DEBUG_CALL_PRINTINT
10941
pcre2_printint(re, stderr, TRUE);
10942
fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
10943
#endif
10944
10945
/* Unless disabled, check whether any single character iterators can be
10946
auto-possessified. The function overwrites the appropriate opcode values, so
10947
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
10948
used in this code because at least one compiler gives a warning about loss of
10949
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
10950
function call. */
10951
10952
54.9k
if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS) != 0)
10953
42.4k
  {
10954
42.4k
  PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
10955
42.4k
  if (PRIV(auto_possessify)(temp, &cb) != 0)
10956
0
    {
10957
0
    PCRE2_DEBUG_UNREACHABLE();
10958
0
    errorcode = ERR80;
10959
0
    }
10960
42.4k
  }
10961
10962
/* Failed to compile, or error while post-processing. */
10963
10964
54.9k
if (errorcode != 0) goto HAD_CB_ERROR;
10965
10966
/* Successful compile. If the anchored option was not passed, set it if
10967
we can determine that the pattern is anchored by virtue of ^ characters or \A
10968
or anything else, such as starting with non-atomic .* when DOTALL is set and
10969
there are no occurrences of *PRUNE or *SKIP (though there is an option to
10970
disable this case). */
10971
10972
54.8k
if ((re->overall_options & PCRE2_ANCHORED) == 0)
10973
51.5k
  {
10974
51.5k
  BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
10975
51.5k
  if (is_anchored(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
10976
378
    re->overall_options |= PCRE2_ANCHORED;
10977
51.5k
  }
10978
10979
/* Set up the first code unit or startline flag, the required code unit, and
10980
then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE
10981
is disabled, as the data it would create will not be used. Note that a first code
10982
unit (but not the startline flag) is useful for anchored patterns because it
10983
can still give a quick "no match" and also avoid searching for a last code
10984
unit. */
10985
10986
54.8k
if ((optim_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)
10987
44.0k
  {
10988
44.0k
  int minminlength = 0;  /* For minimal minlength from first/required CU */
10989
10990
  /* If we do not have a first code unit, see if there is one that is asserted
10991
  (these are not saved during the compile because they can cause conflicts with
10992
  actual literals that follow). */
10993
10994
44.0k
  if (firstcuflags >= REQ_NONE) {
10995
31.6k
    uint32_t assertedcuflags = 0;
10996
31.6k
    uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0);
10997
    /* It would be wrong to use the asserted first code unit as `firstcu` for
10998
     * regexes which are able to match a 1-character string (e.g. /(?=a)b?a/)
10999
     * For that example, if we set both firstcu and reqcu to 'a', it would mean
11000
     * the subject string needs to be at least 2 characters long, which is wrong.
11001
     * With more analysis, we would be able to set firstcu in more cases. */
11002
31.6k
    if (assertedcuflags < REQ_NONE && assertedcu != reqcu) {
11003
83
      firstcu = assertedcu;
11004
83
      firstcuflags = assertedcuflags;
11005
83
    }
11006
31.6k
  }
11007
11008
  /* Save the data for a first code unit. The existence of one means the
11009
  minimum length must be at least 1. */
11010
11011
44.0k
  if (firstcuflags < REQ_NONE)
11012
12.4k
    {
11013
12.4k
    re->first_codeunit = firstcu;
11014
12.4k
    re->flags |= PCRE2_FIRSTSET;
11015
12.4k
    minminlength++;
11016
11017
    /* Handle caseless first code units. */
11018
11019
12.4k
    if ((firstcuflags & REQ_CASELESS) != 0)
11020
2.22k
      {
11021
2.22k
      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
11022
2.03k
        {
11023
2.03k
        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
11024
2.03k
        }
11025
11026
      /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
11027
      In 8-bit UTF mode, code units in the range 128-255 are introductory code
11028
      units and cannot have another case, but if UCP is set they may do. */
11029
11030
182
#ifdef SUPPORT_UNICODE
11031
182
#if PCRE2_CODE_UNIT_WIDTH == 8
11032
182
      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
11033
118
        re->flags |= PCRE2_FIRSTCASELESS;
11034
#else
11035
      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
11036
               UCD_OTHERCASE(firstcu) != firstcu)
11037
        re->flags |= PCRE2_FIRSTCASELESS;
11038
#endif
11039
2.22k
#endif  /* SUPPORT_UNICODE */
11040
2.22k
      }
11041
12.4k
    }
11042
11043
  /* When there is no first code unit, for non-anchored patterns, see if we can
11044
  set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
11045
  branches start with ^ and also when all branches start with non-atomic .* for
11046
  non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
11047
  that disables this case.) */
11048
11049
31.5k
  else if ((re->overall_options & PCRE2_ANCHORED) == 0)
11050
30.6k
    {
11051
30.6k
    BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
11052
30.6k
    if (is_startline(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
11053
757
      re->flags |= PCRE2_STARTLINE;
11054
30.6k
    }
11055
11056
  /* Handle the "required code unit", if one is set. In the UTF case we can
11057
  increment the minimum minimum length only if we are sure this really is a
11058
  different character and not a non-starting code unit of the first character,
11059
  because the minimum length count is in characters, not code units. */
11060
11061
44.0k
  if (reqcuflags < REQ_NONE)
11062
21.8k
    {
11063
#if PCRE2_CODE_UNIT_WIDTH == 16
11064
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
11065
        firstcuflags >= REQ_NONE ||                 /* First not set */
11066
        (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
11067
        (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
11068
#elif PCRE2_CODE_UNIT_WIDTH == 8
11069
21.8k
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
11070
21.8k
        firstcuflags >= REQ_NONE ||                 /* First not set */
11071
21.8k
        (firstcu & 0x80) == 0 ||                    /* First is ASCII */
11072
21.8k
        (reqcu & 0x80) == 0)                        /* Req is ASCII */
11073
21.8k
#endif
11074
21.8k
      {
11075
21.8k
      minminlength++;
11076
21.8k
      }
11077
11078
    /* In the case of an anchored pattern, set up the value only if it follows
11079
    a variable length item in the pattern. */
11080
11081
21.8k
    if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
11082
21.8k
        (reqcuflags & REQ_VARY) != 0)
11083
21.6k
      {
11084
21.6k
      re->last_codeunit = reqcu;
11085
21.6k
      re->flags |= PCRE2_LASTSET;
11086
11087
      /* Handle caseless required code units as for first code units (above). */
11088
11089
21.6k
      if ((reqcuflags & REQ_CASELESS) != 0)
11090
4.20k
        {
11091
4.20k
        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
11092
3.80k
          {
11093
3.80k
          if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
11094
3.80k
          }
11095
403
#ifdef SUPPORT_UNICODE
11096
403
#if PCRE2_CODE_UNIT_WIDTH == 8
11097
403
      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
11098
202
        re->flags |= PCRE2_LASTCASELESS;
11099
#else
11100
      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
11101
               UCD_OTHERCASE(reqcu) != reqcu)
11102
        re->flags |= PCRE2_LASTCASELESS;
11103
#endif
11104
4.20k
#endif  /* SUPPORT_UNICODE */
11105
4.20k
        }
11106
21.6k
      }
11107
21.8k
    }
11108
11109
  /* Study the compiled pattern to set up information such as a bitmap of
11110
  starting code units and a minimum matching length. */
11111
11112
44.0k
  if (PRIV(study)(re) != 0)
11113
0
    {
11114
0
    PCRE2_DEBUG_UNREACHABLE();
11115
0
    errorcode = ERR31;
11116
0
    goto HAD_CB_ERROR;
11117
0
    }
11118
11119
  /* If study() set a bitmap of starting code units, it implies a minimum
11120
  length of at least one. */
11121
11122
44.0k
  if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
11123
5.68k
    minminlength = 1;
11124
11125
  /* If the minimum length set (or not set) by study() is less than the minimum
11126
  implied by required code units, override it. */
11127
11128
44.0k
  if (re->minlength < minminlength) re->minlength = minminlength;
11129
44.0k
  }   /* End of start-of-match optimizations. */
11130
11131
/* Control ends up here in all cases. When running under valgrind, make a
11132
pattern's terminating zero defined again. If memory was obtained for the parsed
11133
version of the pattern, free it before returning. Also free the list of named
11134
groups if a larger one had to be obtained, and likewise the group information
11135
vector. */
11136
11137
54.8k
#ifdef SUPPORT_UNICODE
11138
/* All items must be freed. */
11139
54.8k
PCRE2_ASSERT(cb.first_data == NULL);
11140
54.8k
#endif
11141
11142
64.8k
EXIT:
11143
#ifdef SUPPORT_VALGRIND
11144
if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
11145
#endif
11146
64.8k
if (cb.parsed_pattern != stack_parsed_pattern)
11147
4.93k
  ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
11148
64.8k
if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
11149
156
  ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
11150
64.8k
if (cb.groupinfo != stack_groupinfo)
11151
103
  ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
11152
11153
64.8k
return re;    /* Will be NULL after an error */
11154
11155
/* Errors discovered in parse_regex() set the offset value in the compile
11156
block. Errors discovered before it is called must compute it from the ptr
11157
value. After parse_regex() is called, the offset in the compile block is set to
11158
the end of the pattern, but certain errors in compile_regex() may reset it if
11159
an offset is available in the parsed pattern. */
11160
11161
6.98k
HAD_CB_ERROR:
11162
6.98k
ptr = pattern + cb.erroroffset;
11163
11164
6.98k
HAD_EARLY_ERROR:
11165
6.98k
PCRE2_ASSERT(ptr >= pattern); /* Ensure we don't return invalid erroroffset */
11166
6.98k
PCRE2_ASSERT(ptr <= (pattern + patlen));
11167
6.98k
*erroroffset = ptr - pattern;
11168
11169
9.97k
HAD_ERROR:
11170
9.97k
*errorptr = errorcode;
11171
9.97k
pcre2_code_free(re);
11172
9.97k
re = NULL;
11173
11174
9.97k
if (cb.first_data != NULL)
11175
81
  {
11176
81
  compile_data* current_data = cb.first_data;
11177
81
  do
11178
885
    {
11179
885
    compile_data* next_data = current_data->next;
11180
885
    cb.cx->memctl.free(current_data, cb.cx->memctl.memory_data);
11181
885
    current_data = next_data;
11182
885
    }
11183
885
  while (current_data != NULL);
11184
81
  }
11185
11186
9.97k
goto EXIT;
11187
6.98k
}
11188
11189
/* These #undefs are here to enable unity builds with CMake. */
11190
11191
#undef NLBLOCK /* Block containing newline information */
11192
#undef PSSTART /* Field containing processed string start */
11193
#undef PSEND   /* Field containing processed string end */
11194
11195
/* End of pcre2_compile.c */