Coverage Report

Created: 2026-02-14 07:04

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/pcre2/src/pcre2_compile.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#include "pcre2_compile.h"
43
44
45
46
448k
#define NLBLOCK cb             /* Block containing newline information */
47
#define PSSTART start_pattern  /* Field containing processed string start */
48
120k
#define PSEND   end_pattern    /* Field containing processed string end */
49
50
/* In rare error cases debugging might require calling pcre2_printint(). */
51
52
#if 0  /* Disabled by default; change to 1 to include the pcre2_printint debugging code */
53
#ifdef EBCDIC
54
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
55
#else
56
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
57
#endif
58
#define CHAR_OUTPUT(c)      (c)
59
#define CHAR_OUTPUT_HEX(c)  (c)
60
#define CHAR_INPUT(c)       (c)
61
#define CHAR_INPUT_HEX(c)   (c)
62
#include "pcre2_printint_inc.h"  /* The helper macros defined above are set up before this include */
63
#undef PRINTABLE
64
#undef CHAR_OUTPUT
65
#undef CHAR_OUTPUT_HEX
66
#undef CHAR_INPUT  /* NOTE(review): CHAR_INPUT_HEX is defined above but is not undefined here - confirm whether intentional */
67
#define DEBUG_CALL_PRINTINT
68
#endif
69
70
/* Other debugging code can be enabled by these defines. */
71
72
/* #define DEBUG_SHOW_CAPTURES */
73
/* #define DEBUG_SHOW_PARSED */
74
75
/* There are a few things that vary with different code unit sizes. Handle them
76
by defining macros in order to minimize #if usage. */
77
78
#if PCRE2_CODE_UNIT_WIDTH == 8
79
#define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
80
16.7k
#define XDIGIT(c)                xdigitab[c]
81
82
#else  /* Either 16-bit or 32-bit */
83
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
84
85
#if PCRE2_CODE_UNIT_WIDTH == 16
86
#define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
87
88
#else  /* 32-bit */
89
#define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
90
#endif
91
#endif
92
93
/* Function definitions to allow mutual recursion */
94
95
static int
96
  compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
97
    uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
98
    open_capitem *, compile_block *, PCRE2_SIZE *);
99
100
static int
101
  get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
102
    compile_block *);
103
104
static BOOL
105
  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
106
    compile_block *);
107
108
static int
109
  check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
110
    compile_block *, int *);
111
112
113
/*************************************************
114
*      Code parameters and static tables         *
115
*************************************************/
116
117
934k
#define MAX_GROUP_NUMBER   65535u
118
2.21M
#define MAX_REPEAT_COUNT   65535u
119
1.96M
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
120
121
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
122
different ways in the different pattern scans. The parsing and group-
123
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
124
aligned for this. Having defined the size in code units, we set up
125
C16_WORK_SIZE as the number of elements in the 16-bit vector.
126
127
During the first compiling phase, when determining how much memory is required,
128
the regex is partly compiled into this space, but the compiled parts are
129
discarded as soon as they can be, so that hopefully there will never be an
130
overrun. The code does, however, check for an overrun, which can occur for
131
pathological patterns. The size of the workspace depends on LINK_SIZE because
132
the length of compiled items varies with this.
133
134
In the real compile phase, this workspace is not currently used. */
135
136
87.5k
#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
137
138
#define C16_WORK_SIZE \
139
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
140
141
/* A uint32_t vector is used for caching information about the size of
142
capturing groups, to improve performance. A default is created on the stack of
143
this size. */
144
145
6.24k
#define GROUPINFO_DEFAULT_SIZE 256
146
147
/* The overrun tests check for a slightly smaller size so that they detect the
148
overrun before it actually does run off the end of the data block. */
149
150
7.75M
#define WORK_SIZE_SAFETY_MARGIN (100)
151
152
/* This value determines the size of the initial vector that is used for
153
remembering named groups during the pre-compile. It is allocated on the stack,
154
but if it is too small, it is expanded, in a similar way to the workspace. The
155
value is the number of slots in the list. */
156
157
175k
#define NAMED_GROUP_LIST_SIZE  20
158
159
/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
160
of uint32_t. For short patterns this lives on the stack, with this size. Heap
161
memory is used for longer patterns. */
162
163
84.5k
#define PARSED_PATTERN_DEFAULT_SIZE 1024
164
165
/* Maximum length value to check against when making sure that the variable
166
that holds the compiled pattern length does not overflow. We make it a bit less
167
than INT_MAX to allow for adding in group terminating code units, so that we
168
don't have to check them every time. */
169
170
8.52M
#define OFLOW_MAX (INT_MAX - 20)
171
172
/* Table of extra lengths for each of the meta codes. Must be kept in step with
173
the definitions of the META_ codes. For some items these values are a basic length to which
174
a variable amount has to be added. */
175
176
static unsigned char meta_extra_lengths[] = {
177
  0,             /* META_END */
178
  0,             /* META_ALT */
179
  0,             /* META_ATOMIC */
180
  0,             /* META_BACKREF - more if group is >= 10 */
181
  1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
182
  1,             /* META_BIGVALUE */
183
  3,             /* META_CALLOUT_NUMBER */
184
  3+SIZEOFFSET,  /* META_CALLOUT_STRING */
185
  0,             /* META_CAPTURE */
186
  0,             /* META_CIRCUMFLEX */
187
  0,             /* META_CLASS */
188
  0,             /* META_CLASS_EMPTY */
189
  0,             /* META_CLASS_EMPTY_NOT */
190
  0,             /* META_CLASS_END */
191
  0,             /* META_CLASS_NOT */
192
  0,             /* META_COND_ASSERT */
193
  SIZEOFFSET,    /* META_COND_DEFINE */
194
  1+SIZEOFFSET,  /* META_COND_NAME */
195
  1+SIZEOFFSET,  /* META_COND_NUMBER */
196
  1+SIZEOFFSET,  /* META_COND_RNAME */
197
  1+SIZEOFFSET,  /* META_COND_RNUMBER */
198
  3,             /* META_COND_VERSION */
199
  SIZEOFFSET,    /* META_OFFSET */
200
  0,             /* META_SCS */
201
  1,             /* META_CAPTURE_NAME */
202
  1,             /* META_CAPTURE_NUMBER */
203
  0,             /* META_DOLLAR */
204
  0,             /* META_DOT */
205
  0,             /* META_ESCAPE - one more for ESC_P and ESC_p */
206
  0,             /* META_KET */
207
  0,             /* META_NOCAPTURE */
208
  2,             /* META_OPTIONS */
209
  1,             /* META_POSIX */
210
  1,             /* META_POSIX_NEG */
211
  0,             /* META_RANGE_ESCAPED */
212
  0,             /* META_RANGE_LITERAL */
213
  SIZEOFFSET,    /* META_RECURSE */
214
  1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
215
  0,             /* META_SCRIPT_RUN */
216
  0,             /* META_LOOKAHEAD */
217
  0,             /* META_LOOKAHEADNOT */
218
  SIZEOFFSET,    /* META_LOOKBEHIND */
219
  SIZEOFFSET,    /* META_LOOKBEHINDNOT */
220
  0,             /* META_LOOKAHEAD_NA */
221
  SIZEOFFSET,    /* META_LOOKBEHIND_NA */
222
  1,             /* META_MARK - plus the string length */
223
  0,             /* META_ACCEPT */
224
  0,             /* META_FAIL */
225
  0,             /* META_COMMIT */
226
  1,             /* META_COMMIT_ARG - plus the string length */
227
  0,             /* META_PRUNE */
228
  1,             /* META_PRUNE_ARG - plus the string length */
229
  0,             /* META_SKIP */
230
  1,             /* META_SKIP_ARG - plus the string length */
231
  0,             /* META_THEN */
232
  1,             /* META_THEN_ARG - plus the string length */
233
  0,             /* META_ASTERISK */
234
  0,             /* META_ASTERISK_PLUS */
235
  0,             /* META_ASTERISK_QUERY */
236
  0,             /* META_PLUS */
237
  0,             /* META_PLUS_PLUS */
238
  0,             /* META_PLUS_QUERY */
239
  0,             /* META_QUERY */
240
  0,             /* META_QUERY_PLUS */
241
  0,             /* META_QUERY_QUERY */
242
  2,             /* META_MINMAX */
243
  2,             /* META_MINMAX_PLUS */
244
  2,             /* META_MINMAX_QUERY */
245
  0,             /* META_ECLASS_AND */
246
  0,             /* META_ECLASS_OR */
247
  0,             /* META_ECLASS_SUB */
248
  0,             /* META_ECLASS_XOR */
249
  0              /* META_ECLASS_NOT */
250
};
251
252
/* Types for skipping parts of a parsed pattern. */
253
254
enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
255
256
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
257
variables, which are concerned with first and required code units. A value
258
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
259
matching xxcu variable is set, and the low valued bits are relevant. */
260
261
17.4M
#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
262
5.86M
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
263
718k
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
264
2.54M
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
265
266
/* These flags are used in the groupinfo vector. */
267
268
157k
#define GI_SET_FIXED_LENGTH    0x80000000u
269
8.13k
#define GI_NOT_FIXED_LENGTH    0x40000000u
270
1.15k
#define GI_FIXED_LENGTH_MASK   0x0000ffffu
271
272
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
273
and is fast (a good compiler can turn it into a subtraction and unsigned
274
comparison). */
275
276
2.00M
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
277
278
/* Table to identify hex digits. The tables in chartables are dependent on the
279
locale, and may mark arbitrary characters as digits. We want to recognize only
280
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
281
costs 256 bytes, but it is a lot faster than doing character value tests (at
282
least in some simple cases I timed), and in some applications one wants PCRE2
283
to compile efficiently as well as match efficiently. The value in the table is
284
the binary hex digit value, or 0xff for non-hex digits. */
285
286
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
287
UTF-8 mode. */
288
289
#ifndef EBCDIC
290
static const uint8_t xdigitab[] =  /* Indexed by code unit value 0-255: hex digit value, or 0xff if not a hex digit */
291
  {
292
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
293
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
294
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
295
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
296
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
297
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
298
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
299
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
300
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
301
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
302
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
303
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
304
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
305
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
306
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
307
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
308
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
309
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
310
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
311
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
312
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
313
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
314
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
315
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
316
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
317
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
318
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
319
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
320
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
321
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
322
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
323
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
324
325
#else
326
327
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
328
329
static const uint8_t xdigitab[] =
330
  {
331
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
332
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
333
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
334
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
335
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
336
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
337
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
338
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
339
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
340
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
341
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
342
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
343
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
344
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
345
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
346
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
347
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
348
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
349
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
350
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
351
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
352
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
353
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
354
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
355
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
356
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
357
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
358
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
359
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
360
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
361
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
362
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
363
#endif  /* EBCDIC */
364
365
366
/* Table for handling alphanumeric escaped characters. Positive returns are
367
simple data values; negative values are for special things like \d and so on.
368
Zero means further processing is needed (for things like \x), or the escape is
369
invalid. */
370
371
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
372
in UTF-8 mode. It runs from '0' to 'z'. */
373
374
#ifndef EBCDIC
375
980k
#define ESCAPES_FIRST       CHAR_0
376
484k
#define ESCAPES_LAST        CHAR_z
377
341
#define UPPER_CASE(c)       ((c)-32)  /* ASCII: lower case letter -> upper case; argument parenthesized so any expression is safe */
378
379
static const short int escapes[] = {
380
    /* 0 */ 0,                       /* 1 */ 0,
381
    /* 2 */ 0,                       /* 3 */ 0,
382
    /* 4 */ 0,                       /* 5 */ 0,
383
    /* 6 */ 0,                       /* 7 */ 0,
384
    /* 8 */ 0,                       /* 9 */ 0,
385
    /* : */ ESCAPES_FIRST+0x0a,      /* ; */ ESCAPES_FIRST+0x0b,
386
    /* < */ ESCAPES_FIRST+0x0c,      /* = */ ESCAPES_FIRST+0x0d,
387
    /* > */ ESCAPES_FIRST+0x0e,      /* ? */ ESCAPES_FIRST+0x0f,
388
    /* @ */ ESCAPES_FIRST+0x10,      /* A */ -ESC_A,
389
    /* B */ -ESC_B,                  /* C */ -ESC_C,
390
    /* D */ -ESC_D,                  /* E */ -ESC_E,
391
    /* F */ 0,                       /* G */ -ESC_G,
392
    /* H */ -ESC_H,                  /* I */ 0,
393
    /* J */ 0,                       /* K */ -ESC_K,
394
    /* L */ 0,                       /* M */ 0,
395
    /* N */ -ESC_N,                  /* O */ 0,
396
    /* P */ -ESC_P,                  /* Q */ -ESC_Q,
397
    /* R */ -ESC_R,                  /* S */ -ESC_S,
398
    /* T */ 0,                       /* U */ 0,
399
    /* V */ -ESC_V,                  /* W */ -ESC_W,
400
    /* X */ -ESC_X,                  /* Y */ 0,
401
    /* Z */ -ESC_Z,                  /* [ */ ESCAPES_FIRST+0x2b,
402
    /* \ */ ESCAPES_FIRST+0x2c,      /* ] */ ESCAPES_FIRST+0x2d,
403
    /* ^ */ ESCAPES_FIRST+0x2e,      /* _ */ ESCAPES_FIRST+0x2f,
404
    /* ` */ ESCAPES_FIRST+0x30,      /* a */ CHAR_BEL,
405
    /* b */ -ESC_b,                  /* c */ 0,
406
    /* d */ -ESC_d,                  /* e */ CHAR_ESC,
407
    /* f */ CHAR_FF,                 /* g */ 0,
408
    /* h */ -ESC_h,                  /* i */ 0,
409
    /* j */ 0,                       /* k */ -ESC_k,
410
    /* l */ 0,                       /* m */ 0,
411
    /* n */ CHAR_LF,                 /* o */ 0,
412
    /* p */ -ESC_p,                  /* q */ 0,
413
    /* r */ CHAR_CR,                 /* s */ -ESC_s,
414
    /* t */ CHAR_HT,                 /* u */ 0,
415
    /* v */ -ESC_v,                  /* w */ -ESC_w,
416
    /* x */ 0,                       /* y */ 0,
417
    /* z */ -ESC_z
418
};
419
420
#else
421
422
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
423
It runs from 'a' to '9'. Our EBCDIC support can be provided via the compiler,
424
which can interpret character literals like 'a' or '[' in an EBCDIC codepage;
425
in this case, there is wide variance between codepages on the interpretation of
426
characters between the letters ('[' and '{' and so on are placed in all sorts of
427
different positions in the table). Thankfully however, all EBCDIC codepages
428
place the letters and digits in the same location, so we hardcode that here.
429
Our EBCDIC support can also be provided via numeric literals instead of
430
character literals, so either way, 'CHAR_a' will be 0x81 when PCRE2 is compiled
431
in EBCDIC mode. */
432
433
#define ESCAPES_FIRST       CHAR_a
434
#define ESCAPES_LAST        CHAR_9
435
#define UPPER_CASE(c)       ((c)+64)  /* EBCDIC: lower case letter (0x81..) -> upper case (0xc1..); argument parenthesized so any expression is safe */
436
437
static const short int escapes[] = {
438
    /* 0x81 a */ CHAR_BEL,             /* 0x82 b */ -ESC_b,
439
    /* 0x83 c */ 0,                    /* 0x84 d */ -ESC_d,
440
    /* 0x85 e */ CHAR_ESC,             /* 0x86 f */ CHAR_FF,
441
    /* 0x87 g */ 0,                    /* 0x88 h */ -ESC_h,
442
    /* 0x89 i */ 0,                    /* 0x8a   */ ESCAPES_FIRST+0x09,
443
    /* 0x8b   */ ESCAPES_FIRST+0x0a,   /* 0x8c   */ ESCAPES_FIRST+0x0b,
444
    /* 0x8d   */ ESCAPES_FIRST+0x0c,   /* 0x8e   */ ESCAPES_FIRST+0x0d,
445
    /* 0x8f   */ ESCAPES_FIRST+0x0e,   /* 0x90   */ ESCAPES_FIRST+0x0f,
446
    /* 0x91 j */ 0,                    /* 0x92 k */ -ESC_k,
447
    /* 0x93 l */ 0,                    /* 0x94 m */ 0,
448
    /* 0x95 n */ CHAR_LF,              /* 0x96 o */ 0,
449
    /* 0x97 p */ -ESC_p,               /* 0x98 q */ 0,
450
    /* 0x99 r */ CHAR_CR,              /* 0x9a   */ ESCAPES_FIRST+0x19,
451
    /* 0x9b   */ ESCAPES_FIRST+0x1a,   /* 0x9c   */ ESCAPES_FIRST+0x1b,
452
    /* 0x9d   */ ESCAPES_FIRST+0x1c,   /* 0x9e   */ ESCAPES_FIRST+0x1d,
453
    /* 0x9f   */ ESCAPES_FIRST+0x1e,   /* 0xa0   */ ESCAPES_FIRST+0x1f,
454
    /* 0xa1   */ ESCAPES_FIRST+0x20,   /* 0xa2 s */ -ESC_s,
455
    /* 0xa3 t */ CHAR_HT,              /* 0xa4 u */ 0,
456
    /* 0xa5 v */ -ESC_v,               /* 0xa6 w */ -ESC_w,
457
    /* 0xa7 x */ 0,                    /* 0xa8 y */ 0,
458
    /* 0xa9 z */ -ESC_z,               /* 0xaa   */ ESCAPES_FIRST+0x29,
459
    /* 0xab   */ ESCAPES_FIRST+0x2a,   /* 0xac   */ ESCAPES_FIRST+0x2b,
460
    /* 0xad   */ ESCAPES_FIRST+0x2c,   /* 0xae   */ ESCAPES_FIRST+0x2d,
461
    /* 0xaf   */ ESCAPES_FIRST+0x2e,   /* 0xb0   */ ESCAPES_FIRST+0x2f,
462
    /* 0xb1   */ ESCAPES_FIRST+0x30,   /* 0xb2   */ ESCAPES_FIRST+0x31,
463
    /* 0xb3   */ ESCAPES_FIRST+0x32,   /* 0xb4   */ ESCAPES_FIRST+0x33,
464
    /* 0xb5   */ ESCAPES_FIRST+0x34,   /* 0xb6   */ ESCAPES_FIRST+0x35,
465
    /* 0xb7   */ ESCAPES_FIRST+0x36,   /* 0xb8   */ ESCAPES_FIRST+0x37,
466
    /* 0xb9   */ ESCAPES_FIRST+0x38,   /* 0xba   */ ESCAPES_FIRST+0x39,
467
    /* 0xbb   */ ESCAPES_FIRST+0x3a,   /* 0xbc   */ ESCAPES_FIRST+0x3b,
468
    /* 0xbd   */ ESCAPES_FIRST+0x3c,   /* 0xbe   */ ESCAPES_FIRST+0x3d,
469
    /* 0xbf   */ ESCAPES_FIRST+0x3e,   /* 0xc0   */ ESCAPES_FIRST+0x3f,
470
    /* 0xc1 A */ -ESC_A,               /* 0xc2 B */ -ESC_B,
471
    /* 0xc3 C */ -ESC_C,               /* 0xc4 D */ -ESC_D,
472
    /* 0xc5 E */ -ESC_E,               /* 0xc6 F */ 0,
473
    /* 0xc7 G */ -ESC_G,               /* 0xc8 H */ -ESC_H,
474
    /* 0xc9 I */ 0,                    /* 0xca   */ ESCAPES_FIRST+0x49,
475
    /* 0xcb   */ ESCAPES_FIRST+0x4a,   /* 0xcc   */ ESCAPES_FIRST+0x4b,
476
    /* 0xcd   */ ESCAPES_FIRST+0x4c,   /* 0xce   */ ESCAPES_FIRST+0x4d,
477
    /* 0xcf   */ ESCAPES_FIRST+0x4e,   /* 0xd0   */ ESCAPES_FIRST+0x4f,
478
    /* 0xd1 J */ 0,                    /* 0xd2 K */ -ESC_K,
479
    /* 0xd3 L */ 0,                    /* 0xd4 M */ 0,
480
    /* 0xd5 N */ -ESC_N,               /* 0xd6 O */ 0,
481
    /* 0xd7 P */ -ESC_P,               /* 0xd8 Q */ -ESC_Q,
482
    /* 0xd9 R */ -ESC_R,               /* 0xda   */ ESCAPES_FIRST+0x59,
483
    /* 0xdb   */ ESCAPES_FIRST+0x5a,   /* 0xdc   */ ESCAPES_FIRST+0x5b,
484
    /* 0xdd   */ ESCAPES_FIRST+0x5c,   /* 0xde   */ ESCAPES_FIRST+0x5d,
485
    /* 0xdf   */ ESCAPES_FIRST+0x5e,   /* 0xe0   */ ESCAPES_FIRST+0x5f,
486
    /* 0xe1   */ ESCAPES_FIRST+0x60,   /* 0xe2 S */ -ESC_S,
487
    /* 0xe3 T */ 0,                    /* 0xe4 U */ 0,
488
    /* 0xe5 V */ -ESC_V,               /* 0xe6 W */ -ESC_W,
489
    /* 0xe7 X */ -ESC_X,               /* 0xe8 Y */ 0,
490
    /* 0xe9 Z */ -ESC_Z,               /* 0xea   */ ESCAPES_FIRST+0x69,
491
    /* 0xeb   */ ESCAPES_FIRST+0x6a,   /* 0xec   */ ESCAPES_FIRST+0x6b,
492
    /* 0xed   */ ESCAPES_FIRST+0x6c,   /* 0xee   */ ESCAPES_FIRST+0x6d,
493
    /* 0xef   */ ESCAPES_FIRST+0x6e,   /* 0xf0 0 */ 0,
494
    /* 0xf1 1 */ 0,                    /* 0xf2 2 */ 0,
495
    /* 0xf3 3 */ 0,                    /* 0xf4 4 */ 0,
496
    /* 0xf5 5 */ 0,                    /* 0xf6 6 */ 0,
497
    /* 0xf7 7 */ 0,                    /* 0xf8 8 */ 0,
498
    /* 0xf9 9 */ 0,
499
};
500
501
/* We also need a table of characters that may follow \c in an EBCDIC
502
environment for characters 0-31. */
503
504
static unsigned char ebcdic_escape_c[] = {
505
  CHAR_COMMERCIAL_AT, CHAR_A, CHAR_B, CHAR_C, CHAR_D, CHAR_E, CHAR_F, CHAR_G,
506
  CHAR_H, CHAR_I, CHAR_J, CHAR_K, CHAR_L, CHAR_M, CHAR_N, CHAR_O, CHAR_P,
507
  CHAR_Q, CHAR_R, CHAR_S, CHAR_T, CHAR_U, CHAR_V, CHAR_W, CHAR_X, CHAR_Y,
508
  CHAR_Z, CHAR_LEFT_SQUARE_BRACKET, CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
509
  CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE
510
};
511
512
#endif   /* EBCDIC */
513
514
515
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
516
searched linearly. Put all the names into a single string, in order to reduce
517
the number of relocations when a shared library is dynamically linked. The
518
string is built from string macros so that it works in UTF-8 mode on EBCDIC
519
platforms. */
520
521
typedef struct verbitem {  /* One entry of the verbs[] table */
522
  unsigned int len;          /* Length of verb name */
523
  uint32_t meta;             /* Base META_ code */
524
  int has_arg;               /* Argument requirement: > 0 => required; < 0 => optional, convert to pre-MARK; 0 => optional, bump META code if found */
525
} verbitem;
526
527
static const char verbnames[] =
528
  "\0"                       /* Empty name is a shorthand for MARK */
529
  STRING_MARK0
530
  STRING_ACCEPT0
531
  STRING_F0
532
  STRING_FAIL0
533
  STRING_COMMIT0
534
  STRING_PRUNE0
535
  STRING_SKIP0
536
  STRING_THEN;
537
538
static const verbitem verbs[] = {  /* Entries are in the same order as the names in verbnames */
539
  { 0, META_MARK,   +1 },  /* (empty name): > 0 => must have an argument */
540
  { 4, META_MARK,   +1 },  /* MARK */
541
  { 6, META_ACCEPT, -1 },  /* ACCEPT: < 0 => Optional argument, convert to pre-MARK */
542
  { 1, META_FAIL,   -1 },  /* F */
543
  { 4, META_FAIL,   -1 },  /* FAIL */
544
  { 6, META_COMMIT,  0 },  /* COMMIT: 0 => Optional argument; bump META code if found */
545
  { 5, META_PRUNE,   0 },  /* PRUNE */
546
  { 4, META_SKIP,    0 },  /* SKIP */
547
  { 4, META_THEN,    0 }   /* THEN */
548
};
549
550
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
551
552
/* Verb opcodes, indexed by their META code offset from META_MARK. */
553
554
static const uint32_t verbops[] = {
555
  OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
556
  OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
557
558
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
559
560
typedef struct alasitem {
561
  unsigned int len;          /* Length of name */
562
  uint32_t meta;             /* Base META_ code */
563
} alasitem;
564
565
static const char alasnames[] =
566
  STRING_pla0
567
  STRING_plb0
568
  STRING_napla0
569
  STRING_naplb0
570
  STRING_nla0
571
  STRING_nlb0
572
  STRING_positive_lookahead0
573
  STRING_positive_lookbehind0
574
  STRING_non_atomic_positive_lookahead0
575
  STRING_non_atomic_positive_lookbehind0
576
  STRING_negative_lookahead0
577
  STRING_negative_lookbehind0
578
  STRING_scs0
579
  STRING_scan_substring0
580
  STRING_atomic0
581
  STRING_sr0
582
  STRING_asr0
583
  STRING_script_run0
584
  STRING_atomic_script_run;
585
586
static const alasitem alasmeta[] = {  /* Entries are in the same order as the names in alasnames */
587
  {  3, META_LOOKAHEAD         }, /* pla */
588
  {  3, META_LOOKBEHIND        }, /* plb */
589
  {  5, META_LOOKAHEAD_NA      }, /* napla */
590
  {  5, META_LOOKBEHIND_NA     }, /* naplb */
591
  {  3, META_LOOKAHEADNOT      }, /* nla */
592
  {  3, META_LOOKBEHINDNOT     }, /* nlb */
593
  { 18, META_LOOKAHEAD         }, /* positive_lookahead */
594
  { 19, META_LOOKBEHIND        }, /* positive_lookbehind */
595
  { 29, META_LOOKAHEAD_NA      }, /* non_atomic_positive_lookahead */
596
  { 30, META_LOOKBEHIND_NA     }, /* non_atomic_positive_lookbehind */
597
  { 18, META_LOOKAHEADNOT      }, /* negative_lookahead */
598
  { 19, META_LOOKBEHINDNOT     }, /* negative_lookbehind */
599
  {  3, META_SCS               }, /* scs */
600
  { 14, META_SCS               }, /* scan_substring */
601
  {  6, META_ATOMIC            }, /* atomic */
602
  {  2, META_SCRIPT_RUN        }, /* sr = script run */
603
  {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
604
  { 10, META_SCRIPT_RUN        }, /* script run */
605
  { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
606
};
607
608
static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
609
610
/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
611
612
static uint32_t chartypeoffset[] = {
613
  OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
614
  OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
615
616
/* Tables of names of POSIX character classes and their lengths. The names are
617
now all in a single string, to reduce the number of relocations when a shared
618
library is dynamically loaded. The list of lengths is terminated by a zero
619
length entry. The first three must be alpha, lower, upper, as this is assumed
620
for handling case independence.
621
622
The indices for several classes are stored in pcre2_compile.h - these must
623
be kept in sync with posix_names, posix_name_lengths, posix_class_maps,
624
and posix_substitutes. */
625
626
static const char posix_names[] =
627
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
628
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
629
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
630
  STRING_word0  STRING_xdigit;
631
632
static const uint8_t posix_name_lengths[] = {  /* One length per name in posix_names, same order */
633
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };  /* alpha..space are 5 long, word is 4, xdigit is 6; 0 terminates the list */
634
635
/* Table of class bit maps for each POSIX class. Each class is formed from a
636
base map, with an optional addition or removal of another map. Then, for some
637
classes, there is some additional tweaking: for [:blank:] the vertical space
638
characters are removed, and for [:alpha:] and [:alnum:] the underscore
639
character is removed. The triples in the table consist of the base map offset,
640
second map offset or -1 if no second map, and a non-negative value for map
641
addition or a negative value for map subtraction (if there are two maps). The
642
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
643
remove vertical space characters, 2 => remove underscore. */
644
645
const int PRIV(posix_class_maps)[] = {
646
  cbit_word,   cbit_digit, -2,            /* alpha */
647
  cbit_lower,  -1,          0,            /* lower */
648
  cbit_upper,  -1,          0,            /* upper */
649
  cbit_word,   -1,          2,            /* alnum - word without underscore */
650
  cbit_print,  cbit_cntrl,  0,            /* ascii */
651
  cbit_space,  -1,          1,            /* blank - a GNU extension */
652
  cbit_cntrl,  -1,          0,            /* cntrl */
653
  cbit_digit,  -1,          0,            /* digit */
654
  cbit_graph,  -1,          0,            /* graph */
655
  cbit_print,  -1,          0,            /* print */
656
  cbit_punct,  -1,          0,            /* punct */
657
  cbit_space,  -1,          0,            /* space */
658
  cbit_word,   -1,          0,            /* word - a Perl extension */
659
  cbit_xdigit, -1,          0             /* xdigit */
660
};
661
662
#ifdef SUPPORT_UNICODE
663
664
/* The POSIX class Unicode property substitutes that are used in UCP mode must
665
be in the order of the POSIX class names, defined above. */
666
667
static int posix_substitutes[] = {
668
  PT_GC, ucp_L,     /* alpha */
669
  PT_PC, ucp_Ll,    /* lower */
670
  PT_PC, ucp_Lu,    /* upper */
671
  PT_ALNUM, 0,      /* alnum */
672
  -1, 0,            /* ascii, treat as non-UCP */
673
  -1, 1,            /* blank, treat as \h */
674
  PT_PC, ucp_Cc,    /* cntrl */
675
  PT_PC, ucp_Nd,    /* digit */
676
  PT_PXGRAPH, 0,    /* graph */
677
  PT_PXPRINT, 0,    /* print */
678
  PT_PXPUNCT, 0,    /* punct */
679
  PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
680
  PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
681
  PT_PXXDIGIT, 0    /* xdigit */  /* Perl has additional hex digits */
682
};
683
#endif  /* SUPPORT_UNICODE */
684
685
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
686
are allowed. */
687
688
/* Option bits that remain acceptable when PCRE2_LITERAL is set. */
#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED| \
   PCRE2_AUTO_CALLOUT| \
   PCRE2_CASELESS| \
   PCRE2_ENDANCHORED| \
   PCRE2_FIRSTLINE| \
   PCRE2_LITERAL| \
   PCRE2_MATCH_INVALID_UTF| \
   PCRE2_NO_START_OPTIMIZE| \
   PCRE2_NO_UTF_CHECK| \
   PCRE2_USE_OFFSET_LIMIT| \
   PCRE2_UTF)
692
693
/* Full option mask: the literal-mode subset plus the remaining option bits. */
#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS| \
   PCRE2_ALLOW_EMPTY_CLASS| \
   PCRE2_ALT_BSUX| \
   PCRE2_ALT_CIRCUMFLEX| \
   PCRE2_ALT_VERBNAMES| \
   PCRE2_DOLLAR_ENDONLY| \
   PCRE2_DOTALL| \
   PCRE2_DUPNAMES| \
   PCRE2_EXTENDED| \
   PCRE2_EXTENDED_MORE| \
   PCRE2_MATCH_UNSET_BACKREF| \
   PCRE2_MULTILINE| \
   PCRE2_NEVER_BACKSLASH_C| \
   PCRE2_NEVER_UCP| \
   PCRE2_NEVER_UTF| \
   PCRE2_NO_AUTO_CAPTURE| \
   PCRE2_NO_AUTO_POSSESS| \
   PCRE2_NO_DOTSTAR_ANCHOR| \
   PCRE2_UCP| \
   PCRE2_UNGREEDY| \
   PCRE2_ALT_EXTENDED_CLASS)
701
702
/* Extra-option bits for the literal-mode subset. */
#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE| \
    PCRE2_EXTRA_MATCH_WORD| \
    PCRE2_EXTRA_CASELESS_RESTRICT| \
    PCRE2_EXTRA_TURKISH_CASING)
705
706
/* Full extra-option mask: the literal-mode subset plus the remaining bits. */
#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS| \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES| \
    PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL| \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF| \
    PCRE2_EXTRA_ALT_BSUX| \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK| \
    PCRE2_EXTRA_ASCII_BSD| \
    PCRE2_EXTRA_ASCII_BSS| \
    PCRE2_EXTRA_ASCII_BSW| \
    PCRE2_EXTRA_ASCII_POSIX| \
    PCRE2_EXTRA_ASCII_DIGIT| \
    PCRE2_EXTRA_PYTHON_OCTAL| \
    PCRE2_EXTRA_NO_BS0| \
    PCRE2_EXTRA_NEVER_CALLOUT)
714
715
/* This is a table of start-of-pattern options such as (*UTF) and settings such
716
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
717
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
718
generic and always supported. */
719
720
/* Kinds of start-of-pattern item; selects how a table entry's value is used. */
enum {
  PSO_OPT,      /* Value is an option bit */
  PSO_XOPT,     /* Value is an xoption bit */
  PSO_FLG,      /* Value is a flag bit */
  PSO_NL,       /* Value is a newline type */
  PSO_BSR,      /* Value is a \R type */
  PSO_LIMH,     /* Read integer value for heap limit */
  PSO_LIMM,     /* Read integer value for match limit */
  PSO_LIMD,     /* Read integer value for depth limit */
  PSO_OPTMZ     /* Value is an optimization bit */
};
730
731
/* One entry in the table of start-of-pattern items. */
typedef struct pso {
  const char *name;     /* Text of the item */
  uint16_t    length;   /* Length paired with the name in each table entry */
  uint16_t    type;     /* One of the PSO_xxx kinds */
  uint32_t    value;    /* Meaning depends on type; 0 in the limit entries */
} pso;
737
738
/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
739
740
static const pso pso_list[] = {
741
  { STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
742
  { STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
743
  { STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
744
  { STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
745
  { STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
746
  { STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },
747
  { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },
748
  { STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
749
  { STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },
750
  { STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT },
751
  { STRING_TURKISH_CASING_RIGHTPAR,    15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING },
752
  { STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
753
  { STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
754
  { STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
755
  { STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
756
  { STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
757
  { STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
758
  { STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
759
  { STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
760
  { STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
761
  { STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
762
  { STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
763
  { STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
764
};
765
766
/* This table is used when converting repeating opcodes into possessified
767
versions as a result of an explicit possessive quantifier such as ++. A zero
768
value means there is no possessified version - in those cases the item in
769
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
770
because all relevant opcodes are less than that. */
771
772
static const uint8_t opcode_possessify[] = {
773
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
774
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
775
776
  0,                       /* NOTI */
777
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
778
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
779
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
780
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
781
  0,                       /* EXACT */
782
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
783
784
  OP_POSSTARI, 0,          /* STARI, MINSTARI */
785
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
786
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
787
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
788
  0,                       /* EXACTI */
789
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
790
791
  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
792
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
793
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
794
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
795
  0,                       /* NOTEXACT */
796
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
797
798
  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
799
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
800
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
801
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
802
  0,                       /* NOTEXACTI */
803
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
804
805
  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
806
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
807
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
808
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
809
  0,                       /* TYPEEXACT */
810
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
811
812
  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
813
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
814
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
815
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
816
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
817
818
  0, 0, 0, 0,              /* CLASS, NCLASS, XCLASS, ECLASS */
819
  0, 0,                    /* REF, REFI */
820
  0, 0,                    /* DNREF, DNREFI */
821
  0, 0,                    /* RECURSE, CALLOUT */
822
};
823
824
/* Compile-time check that the table has the correct size. */
825
STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify);
826
827
828
#ifdef DEBUG_SHOW_PARSED
829
/*************************************************
830
*     Show the parsed pattern for debugging      *
831
*************************************************/
832
833
/* For debugging the pre-scan, this code, which outputs the parsed data vector,
834
can be enabled. */
835
836
static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

/* Each pass prints one item of the parsed vector on its own line on stderr.
The loop ends at META_END or at an unrecognized META code. */

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t item = *pptr;
  uint32_t meta_arg = META_DATA(item);
  const char *label = "";   /* Text shared by cases that print the same way */

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), item);

  /* Values below META_END are literal data; show the printable ones. */

  if (item < META_END)
    {
    if (item > 32 && item < 128) fprintf(stderr, "%c", item);
    pptr++;
    fputc('\n', stderr);
    continue;
    }

  pptr++;
  switch (META_CODE(item))
    {
    default:
    fputs("**** OOPS - unknown META value - giving up ****\n", stderr);
    return;

    case META_END:
    fputs("META_END\n", stderr);
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %d", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %d %zd", meta_arg, offset);
    break;

    case META_BACKREF:
    if (meta_arg < 10)
      {
      offset = cb->small_ref_offset[meta_arg];
      }
    else
      {
      GETOFFSET(offset, pptr);
      }
    fprintf(stderr, "META_BACKREF %d %zd", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t packed = *pptr++;
      uint32_t ptype = packed >> 16;
      uint32_t pvalue = packed & 0xffff;
      fprintf(stderr, "META \\%c %d %d", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
        ptype, pvalue);
      }
    else
      {
      uint32_t esc_char;

      /* \g is the one escape that may appear here without a negated entry
      in the escapes table; everything else is found by searching it. */

      if (meta_arg == ESC_g)
        {
        esc_char = CHAR_g;
        }
      else
        {
        for (esc_char = ESCAPES_FIRST; esc_char <= ESCAPES_LAST; esc_char++)
          {
          if (meta_arg == (uint32_t)(-escapes[esc_char - ESCAPES_FIRST]))
            break;
          }
        }
      if (esc_char > ESCAPES_LAST) esc_char = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", esc_char);
      }
    break;

    /* The three {min,max} forms differ only in the trailing character. */

    case META_MINMAX:
    label = "";
    goto SHOWMINMAX;

    case META_MINMAX_QUERY:
    label = "?";
    goto SHOWMINMAX;

    case META_MINMAX_PLUS:
    label = "+";
    SHOWMINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}%s", min, max, label);
    else
      fprintf(stderr, "META {%d,}%s", min, label);
    break;

    case META_BIGVALUE:
    fprintf(stderr, "META_BIGVALUE %.8x", *pptr++);
    break;

    case META_CIRCUMFLEX:     fputs("META_CIRCUMFLEX", stderr); break;
    case META_COND_ASSERT:    fputs("META_COND_ASSERT", stderr); break;
    case META_DOLLAR:         fputs("META_DOLLAR", stderr); break;
    case META_DOT:            fputs("META_DOT", stderr); break;
    case META_ASTERISK:       fputs("META *", stderr); break;
    case META_ASTERISK_QUERY: fputs("META *?", stderr); break;
    case META_ASTERISK_PLUS:  fputs("META *+", stderr); break;
    case META_PLUS:           fputs("META +", stderr); break;
    case META_PLUS_QUERY:     fputs("META +?", stderr); break;
    case META_PLUS_PLUS:      fputs("META ++", stderr); break;
    case META_QUERY:          fputs("META ?", stderr); break;
    case META_QUERY_QUERY:    fputs("META ??", stderr); break;
    case META_QUERY_PLUS:     fputs("META ?+", stderr); break;

    case META_ATOMIC:         fputs("META (?>", stderr); break;
    case META_NOCAPTURE:      fputs("META (?:", stderr); break;
    case META_LOOKAHEAD:      fputs("META (?=", stderr); break;
    case META_LOOKAHEADNOT:   fputs("META (?!", stderr); break;
    case META_LOOKAHEAD_NA:   fputs("META (*napla:", stderr); break;
    case META_SCRIPT_RUN:     fputs("META (*sr:", stderr); break;
    case META_KET:            fputs("META )", stderr); break;

    case META_ALT:
    fprintf(stderr, "META | %d", meta_arg);
    break;

    case META_CLASS:           fputs("META [", stderr); break;
    case META_CLASS_NOT:       fputs("META [^", stderr); break;
    case META_CLASS_END:       fputs("META ]", stderr); break;
    case META_CLASS_EMPTY:     fputs("META []", stderr); break;
    case META_CLASS_EMPTY_NOT: fputs("META [^]", stderr); break;

    case META_RANGE_LITERAL:   fputs("META - (literal)", stderr); break;
    case META_RANGE_ESCAPED:   fputs("META - (escaped)", stderr); break;

    case META_POSIX:
    fprintf(stderr, "META_POSIX %d", *pptr++);
    break;

    case META_POSIX_NEG:
    fprintf(stderr, "META_POSIX_NEG %d", *pptr++);
    break;

    case META_ACCEPT: fputs("META (*ACCEPT)", stderr); break;
    case META_FAIL:   fputs("META (*FAIL)", stderr); break;
    case META_COMMIT: fputs("META (*COMMIT)", stderr); break;
    case META_PRUNE:  fputs("META (*PRUNE)", stderr); break;
    case META_SKIP:   fputs("META (*SKIP)", stderr); break;
    case META_THEN:   fputs("META (*THEN)", stderr); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* The lookbehinds print meta_arg and the next word, then skip two
    words. */

    case META_LOOKBEHIND:
    label = "META (?<=";
    goto SHOWLOOKBEHIND;

    case META_LOOKBEHIND_NA:
    label = "META (*naplb:";
    goto SHOWLOOKBEHIND;

    case META_LOOKBEHINDNOT:
    label = "META (?<!";
    SHOWLOOKBEHIND:
    fprintf(stderr, "%s %d %d", label, meta_arg, *pptr);
    pptr += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%d) next=%d/%d", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%d offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zd next=%d/%d", offset, patoffset, patlength);
      }
    break;

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %d offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    pptr++;
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%d.", *pptr++);
    fprintf(stderr, "%d)", *pptr++);
    break;

    /* Items that carry only an offset. */

    case META_COND_DEFINE:
    label = "META (?(DEFINE)";
    goto SHOWOFFSET;

    case META_OFFSET:
    label = "META_OFFSET";
    SHOWOFFSET:
    fprintf(stderr, "%s offset=", label);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    /* Items that carry a length followed by an offset. */

    case META_RECURSE_BYNAME:
    label = "META (?(&name)";
    goto SHOWLENGTHOFFSET;

    case META_BACKREF_BYNAME:
    label = "META_BACKREF_BYNAME";
    goto SHOWLENGTHOFFSET;

    case META_COND_NAME:
    label = "META (?(<name>)";
    goto SHOWLENGTHOFFSET;

    case META_COND_RNAME:
    label = "META (?(R&name)";
    goto SHOWLENGTHOFFSET;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    label = "META (?(Rnumber)";
    SHOWLENGTHOFFSET:
    fprintf(stderr, "%s length=%d offset=", label, *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zd", offset);
    break;

    case META_SCS:
    fputs("META (*scan_substring:", stderr);
    break;

    case META_CAPTURE_NAME:
    fprintf(stderr, "META_CAPTURE_NAME length=%d relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    case META_CAPTURE_NUMBER:
    fprintf(stderr, "META_CAPTURE_NUMBER %d relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    /* Verbs with an argument: a length word followed by that many
    characters. */

    case META_MARK:
    fputs("META (*MARK:", stderr);
    goto SHOWARG;

    case META_COMMIT_ARG:
    fputs("META (*COMMIT:", stderr);
    goto SHOWARG;

    case META_PRUNE_ARG:
    fputs("META (*PRUNE:", stderr);
    goto SHOWARG;

    case META_SKIP_ARG:
    fputs("META (*SKIP:", stderr);
    goto SHOWARG;

    case META_THEN_ARG:
    fputs("META (*THEN:", stderr);
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t ch = *pptr++;
      if (ch > 32 && ch < 128)
        fprintf(stderr, "%c", ch);
      else
        fprintf(stderr, "\\x{%x}", ch);
      }
    fprintf(stderr, ") length=%u", length);
    break;

    case META_ECLASS_AND: fputs("META_ECLASS_AND", stderr); break;
    case META_ECLASS_OR:  fputs("META_ECLASS_OR", stderr); break;
    case META_ECLASS_SUB: fputs("META_ECLASS_SUB", stderr); break;
    case META_ECLASS_XOR: fputs("META_ECLASS_XOR", stderr); break;
    case META_ECLASS_NOT: fputs("META_ECLASS_NOT", stderr); break;
    }

  fputc('\n', stderr);
  }
}
1120
#endif  /* DEBUG_SHOW_PARSED */
1121
1122
1123
1124
/*************************************************
1125
*               Copy compiled code               *
1126
*************************************************/
1127
1128
/* Compiled JIT code cannot be copied, so the new compiled block has no
1129
associated JIT data. */
1130
1131
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code *code)
{
pcre2_code *copy;

if (code == NULL) return NULL;

/* Duplicate the whole block, using the allocator stored in the original.
The copy starts out with no JIT data. */

copy = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (copy == NULL) return NULL;
memcpy(copy, code, code->blocksize);
copy->executable_jit = NULL;

/* When PCRE2_DEREF_TABLES is set, a reference count is stored directly
after the tables, and the copy counts as one more reference. */

if ((code->flags & PCRE2_DEREF_TABLES) != 0)
  {
  PCRE2_SIZE *refs = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
  *refs += 1;
  }

return copy;
}
1154
1155
1156
1157
/*************************************************
1158
*     Copy compiled code and character tables    *
1159
*************************************************/
1160
1161
/* Compiled JIT code cannot be copied, so the new compiled block has no
1162
associated JIT data. This version of code_copy also makes a separate copy of
1163
the character tables. */
1164
1165
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code *code)
{
pcre2_code *copy;
uint8_t *tables_copy;
PCRE2_SIZE *refs;

if (code == NULL) return NULL;

/* First duplicate the compiled block itself, without any JIT data. */

copy = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (copy == NULL) return NULL;
memcpy(copy, code, code->blocksize);
copy->executable_jit = NULL;

/* Then get memory for a private set of tables, with room for a reference
count after them. If that fails, release the block just copied. */

tables_copy = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
  code->memctl.memory_data);
if (tables_copy == NULL)
  {
  code->memctl.free((void *)copy, code->memctl.memory_data);
  return NULL;
  }

/* The new tables have exactly one reference: the new code block. */

memcpy(tables_copy, code->tables, TABLES_LENGTH);
refs = (PCRE2_SIZE *)(tables_copy + TABLES_LENGTH);
*refs = 1;

copy->tables = tables_copy;
copy->flags |= PCRE2_DEREF_TABLES;
return copy;
}
1193
1194
1195
1196
/*************************************************
1197
*               Free compiled code               *
1198
*************************************************/
1199
1200
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1201
pcre2_code_free(pcre2_code *code)
1202
87.5k
{
1203
87.5k
PCRE2_SIZE* ref_count;
1204
1205
87.5k
if (code != NULL)
1206
75.8k
  {
1207
75.8k
#ifdef SUPPORT_JIT
1208
75.8k
  if (code->executable_jit != NULL)
1209
70.4k
    PRIV(jit_free)(code->executable_jit, &code->memctl);
1210
75.8k
#endif
1211
1212
75.8k
  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1213
0
    {
1214
    /* Decoded tables belong to the codes after deserialization, and they must
1215
    be freed when there are no more references to them. The *ref_count should
1216
    always be > 0. */
1217
1218
0
    ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1219
0
    if (*ref_count > 0)
1220
0
      {
1221
0
      (*ref_count)--;
1222
0
      if (*ref_count == 0)
1223
0
        code->memctl.free((void *)code->tables, code->memctl.memory_data);
1224
0
      }
1225
0
    }
1226
1227
75.8k
  code->memctl.free(code, code->memctl.memory_data);
1228
75.8k
  }
1229
87.5k
}
1230
1231
1232
1233
/*************************************************
1234
*         Read a number, possibly signed         *
1235
*************************************************/
1236
1237
/* This function is used to read numbers in the pattern. The initial pointer
1238
must be at the sign or first digit of the number. When relative values
1239
(introduced by + or -) are allowed, they are relative group numbers, and the
1240
result must be greater than zero.
1241
1242
Arguments:
1243
  ptrptr      points to the character pointer variable
1244
  ptrend      points to the end of the input string
1245
  allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1246
  max_value   the largest number allowed;
1247
              you must not pass a value for max_value larger than
1248
              INT_MAX/10 - 1 because this function relies on max_value to
1249
              avoid integer overflow
1250
  max_error   the error to give for an over-large number
1251
  intptr      where to put the result
1252
  errcodeptr  where to put an error code
1253
1254
Returns:      TRUE  - a number was read
1255
              FALSE - errorcode == 0 => no number was found
1256
                      errorcode != 0 => an error occurred
1257
*/
1258
1259
static BOOL
1260
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1261
  uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1262
354k
{
1263
354k
int sign = 0;
1264
354k
uint32_t n = 0;
1265
354k
PCRE2_SPTR ptr = *ptrptr;
1266
354k
BOOL yield = FALSE;
1267
1268
354k
PCRE2_ASSERT(max_value <= INT_MAX/10 - 1);
1269
1270
354k
*errorcodeptr = 0;
1271
1272
354k
if (allow_sign >= 0 && ptr < ptrend)
1273
66.5k
  {
1274
66.5k
  if (*ptr == CHAR_PLUS)
1275
45.9k
    {
1276
45.9k
    sign = +1;
1277
45.9k
    max_value -= allow_sign;
1278
45.9k
    ptr++;
1279
45.9k
    }
1280
20.6k
  else if (*ptr == CHAR_MINUS)
1281
2.88k
    {
1282
2.88k
    sign = -1;
1283
2.88k
    ptr++;
1284
2.88k
    }
1285
66.5k
  }
1286
1287
354k
if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1288
677k
while (ptr < ptrend && IS_DIGIT(*ptr))
1289
400k
  {
1290
400k
  n = n * 10 + (*ptr++ - CHAR_0);
1291
400k
  if (n > max_value)
1292
2.72k
    {
1293
2.72k
    *errorcodeptr = max_error;
1294
18.2k
    while (ptr < ptrend && IS_DIGIT(*ptr)) ptr++;
1295
2.72k
    goto EXIT;
1296
2.72k
    }
1297
400k
  }
1298
1299
277k
if (allow_sign >= 0 && sign != 0)
1300
48.7k
  {
1301
48.7k
  if (n == 0)
1302
13
    {
1303
13
    *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1304
13
    goto EXIT;
1305
13
    }
1306
1307
48.7k
  if (sign > 0) n += allow_sign;
1308
2.87k
  else if (n > (uint32_t)allow_sign)
1309
29
    {
1310
29
    *errorcodeptr = ERR15;  /* Non-existent subpattern */
1311
29
    goto EXIT;
1312
29
    }
1313
2.84k
  else n = allow_sign + 1 - n;
1314
48.7k
  }
1315
1316
277k
yield = TRUE;
1317
1318
280k
EXIT:
1319
280k
*intptr = n;
1320
280k
*ptrptr = ptr;
1321
280k
return yield;
1322
277k
}
1323
1324
1325
1326
/*************************************************
1327
*         Read repeat counts                     *
1328
*************************************************/
1329
1330
/* Read an item of the form {n,m} and return the values when non-NULL pointers
1331
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1332
larger value is used for "unlimited". We have to use signed arguments for
1333
read_number() because it is capable of returning a signed value. As of Perl
1334
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1335
tabs after { and before } and between the numbers and the comma, so we do too.
1336
1337
Arguments:
1338
  ptrptr         points to pointer to character after '{'
1339
  ptrend         pointer to end of input
1340
  minp           if not NULL, pointer to int for min
1341
  maxp           if not NULL, pointer to int for max
1342
  errorcodeptr   points to error code variable
1343
1344
Returns:         FALSE if not a repeat quantifier, errorcode set zero
1345
                 FALSE on error, with errorcode set non-zero
1346
                 TRUE on success, with pointer updated to point after '}'
1347
*/
1348
1349
static BOOL
1350
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1351
  uint32_t *maxp, int *errorcodeptr)
1352
343k
{
1353
343k
PCRE2_SPTR p = *ptrptr;
1354
343k
PCRE2_SPTR pp;
1355
343k
BOOL yield = FALSE;
1356
343k
BOOL had_minimum = FALSE;
1357
343k
int32_t min = 0;
1358
343k
int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1359
1360
343k
*errorcodeptr = 0;
1361
344k
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1362
1363
/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1364
such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1365
error. */
1366
1367
343k
pp = p;
1368
343k
if (pp < ptrend && IS_DIGIT(*pp))
1369
163k
  {
1370
163k
  had_minimum = TRUE;
1371
262k
  while (++pp < ptrend && IS_DIGIT(*pp)) {}
1372
163k
  }
1373
1374
356k
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1375
343k
if (pp >= ptrend) return FALSE;
1376
1377
341k
if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1378
91.2k
  {
1379
91.2k
  if (!had_minimum) return FALSE;
1380
91.2k
  }
1381
250k
else
1382
250k
  {
1383
250k
  if (*pp++ != CHAR_COMMA) return FALSE;
1384
106k
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1385
103k
  if (pp >= ptrend) return FALSE;
1386
103k
  if (IS_DIGIT(*pp))
1387
71.7k
    {
1388
95.8k
    while (++pp < ptrend && IS_DIGIT(*pp)) {}
1389
71.7k
    }
1390
31.9k
  else if (!had_minimum) return FALSE;
1391
120k
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1392
95.9k
  if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1393
95.9k
  }
1394
1395
/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}
1396
or {n,m}. The only error that read_number() can return is for a number that is
1397
too big. If *errorcodeptr is returned as zero it means no number was found. */
1398
1399
/* Deal with {,m} or n too big. If we successfully read m there is no need to
1400
check m >= n because n defaults to zero. */
1401
1402
167k
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1403
46.4k
  {
1404
46.4k
  if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1405
46.3k
  p++;  /* Skip comma and subsequent spaces */
1406
46.9k
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1407
46.3k
  if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1408
31
    {
1409
31
    if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1410
31
    }
1411
46.3k
  }
1412
1413
/* Have read one number. Deal with {n} or {n,} or {n,m} */
1414
1415
121k
else
1416
121k
  {
1417
132k
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1418
121k
  if (*p == CHAR_RIGHT_CURLY_BRACKET)
1419
87.6k
    {
1420
87.6k
    max = min;
1421
87.6k
    }
1422
33.4k
  else   /* Handle {n,} or {n,m} */
1423
33.4k
    {
1424
33.4k
    p++;    /* Skip comma and subsequent spaces */
1425
34.7k
    while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1426
33.4k
    if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1427
15.6k
      {
1428
15.6k
      if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1429
15.6k
      }
1430
1431
33.3k
    if (max < min)
1432
19
      {
1433
19
      *errorcodeptr = ERR4;
1434
19
      goto EXIT;
1435
19
      }
1436
33.3k
    }
1437
121k
  }
1438
1439
/* Valid quantifier exists */
1440
1441
190k
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1442
167k
p++;
1443
167k
yield = TRUE;
1444
167k
if (minp != NULL) *minp = (uint32_t)min;
1445
167k
if (maxp != NULL) *maxp = (uint32_t)max;
1446
1447
/* Update the pattern pointer */
1448
1449
167k
EXIT:
1450
167k
*ptrptr = p;
1451
167k
return yield;
1452
167k
}
1453
1454
1455
1456
/*************************************************
1457
*            Handle escapes                      *
1458
*************************************************/
1459
1460
/* This function is called when a \ has been encountered. It either returns a
1461
positive value for a simple escape such as \d, or 0 for a data character, which
1462
is placed in chptr. A backreference to group n is returned as -(n+1). On
1463
entry, ptr is pointing at the character after \. On exit, it points after the
1464
final code unit of the escape sequence.
1465
1466
This function is also called from pcre2_substitute() to handle escape sequences
1467
in replacement strings. In this case, the cb argument is NULL, and in the case
1468
of escapes that have further processing, only \c, \g, \o, \x, and escapes that
1469
start with a digit are accepted. The options argument is the final value of the
1470
compiled pattern's options.
1471
1472
Arguments:
1473
  ptrptr         points to the input position pointer
1474
  ptrend         points to the end of the input
1475
  chptr          points to a returned data character
1476
  errorcodeptr   points to the errorcode variable (containing zero)
1477
  options        the current options bits
1478
  xoptions       the current extra options bits
1479
  bracount       the number of capturing parentheses encountered so far
1480
  isclass        TRUE if in a character class
1481
  cb             compile data block or NULL when called from pcre2_substitute()
1482
1483
Returns:         zero => a data character
1484
                 positive => a special escape sequence
1485
                 negative => a numerical back reference
1486
                 on error, errorcodeptr is set non-zero
1487
*/
1488
1489
int
1490
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1491
  int *errorcodeptr, uint32_t options, uint32_t xoptions, uint32_t bracount,
1492
  BOOL isclass, compile_block *cb)
1493
538k
{
1494
538k
BOOL utf = (options & PCRE2_UTF) != 0;
1495
538k
BOOL alt_bsux =
1496
538k
  ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1497
538k
PCRE2_SPTR ptr = *ptrptr;
1498
538k
uint32_t c, cc;
1499
538k
int escape = 0;
1500
538k
int i;
1501
1502
/* If backslash is at the end of the string, it's an error. */
1503
1504
538k
if (ptr >= ptrend)
1505
84
  {
1506
84
  *errorcodeptr = ERR1;
1507
84
  return 0;   /* Early exit: *ptrptr and *chptr are not updated */
1508
84
  }
1509
1510
538k
GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1511
538k
*errorcodeptr = 0;              /* Be optimistic */
1512
1513
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1514
value test saves a memory lookup for code points outside the alphanumeric
1515
range. */
1516
1517
538k
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1518
1519
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1520
positive value is a literal value for something like \n. A negative value is
1521
the negation of one of the ESC_ macros that is passed back for handling by the
1522
calling function. Some extra checking is needed for \N because only \N{U+dddd}
1523
is supported. If the value is zero, further processing is handled below. */
1524
1525
442k
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1526
408k
  {
1527
408k
  if (i > 0)
1528
62.9k
    {
1529
62.9k
    c = (uint32_t)i;
1530
62.9k
    if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1531
0
      c = CHAR_LF;
1532
62.9k
    }
1533
345k
  else  /* Negative table entry */
1534
345k
    {
1535
345k
    escape = -i;                    /* Else return a special escape */
1536
345k
    if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1537
27.1k
      cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1538
1539
    /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1540
    Unicode code points, as well as plain \N for "not newline". PCRE does not
1541
    support \N{name}. However, it does support quantification such as \N{2,3},
1542
    so if \N{ is not followed by U+dddd we check for a quantifier. */
1543
1544
345k
    if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1545
489
      {
1546
489
      PCRE2_SPTR p = ptr + 1;
1547
1548
      /* Perl ignores spaces and tabs after { */
1549
1550
888
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1551
1552
      /* \N{U+ can be handled by the \x{ code. However, this construction is
1553
      not valid in EBCDIC environments because it specifies a Unicode
1554
      character, not a codepoint in the local code. For example \N{U+0041}
1555
      must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1556
      casing semantics for the entire pattern, so allow it only in UTF (i.e.
1557
      Unicode) mode. */
1558
1559
489
      if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1560
132
        {
1561
132
#ifndef EBCDIC
1562
132
        if (utf)
1563
78
          {
1564
78
          ptr = p + 2;
1565
78
          escape = 0;   /* Not a fancy escape after all */
1566
78
          goto COME_FROM_NU;
1567
78
          }
1568
54
#endif
1569
1570
        /* Improve error offset. */
1571
54
        ptr = p + 2;
1572
297
        while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1573
486
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1574
54
        if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET) ptr++;
1575
1576
54
        *errorcodeptr = ERR93;
1577
54
        }
1578
1579
      /* Give an error in contexts where quantifiers are not allowed
1580
      (character classes; substitution strings). */
1581
1582
357
      else if (isclass || cb == NULL)
1583
3
        {
1584
3
        ptr++; /* Skip over the opening brace */
1585
3
        *errorcodeptr = ERR37;
1586
3
        }
1587
1588
      /* Give an error if what follows is not a quantifier, but don't override
1589
      an error set by the quantifier reader (e.g. number overflow). */
1590
1591
354
      else
1592
354
        {
1593
354
        if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1594
73
             *errorcodeptr == 0)
1595
56
          {
1596
56
          ptr++; /* Skip over the opening brace */
1597
56
          *errorcodeptr = ERR37;
1598
56
          }
1599
354
        }
1600
489
      }
1601
345k
    }
1602
408k
  }
1603
1604
/* Escapes that need further processing, including those that are unknown, have
1605
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1606
\g, \o, \x, and digit escapes are accepted (\u and \U can never appear as they
1607
are used for case forcing). */
1608
1609
33.9k
else
1610
33.9k
  {
1611
33.9k
  int s;
1612
33.9k
  PCRE2_SPTR oldptr;
1613
33.9k
  BOOL overflow;
1614
1615
  /* Filter calls from pcre2_substitute(). */
1616
1617
33.9k
  if (cb == NULL)
1618
0
    {
1619
0
    if (!(c >= CHAR_0 && c <= CHAR_9) && c != CHAR_c && c != CHAR_o &&
1620
0
        c != CHAR_x && c != CHAR_g)
1621
0
      {
1622
0
      *errorcodeptr = ERR3;
1623
0
      goto EXIT;
1624
0
      }
1625
0
    alt_bsux = FALSE;   /* Do not modify \x handling */
1626
0
    }
1627
1628
33.9k
  switch (c)
1629
33.9k
    {
1630
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
1631
    error. */
1632
1633
4
    case CHAR_F:
1634
7
    case CHAR_l:
1635
10
    case CHAR_L:
1636
10
    *errorcodeptr = ERR37;
1637
10
    break;
1638
1639
    /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1640
    is set. Otherwise, \u must be followed by exactly four hex digits or, if
1641
    PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1642
    Otherwise it is a lowercase u letter. This gives some compatibility with
1643
    ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1644
    allowed. When \u{ is not followed by hex digits, a special return is given
1645
    because otherwise \u{ 12} (for example) would be treated as u{12}. */
1646
1647
3.64k
    case CHAR_u:
1648
3.64k
    if (!alt_bsux)
1649
121
      *errorcodeptr = ERR37;
1650
3.52k
    else
1651
3.52k
      {
1652
3.52k
      uint32_t xc;
1653
1654
3.52k
      if (ptr >= ptrend) break;
1655
3.52k
      if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1656
221
          (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1657
0
        {
1658
0
        PCRE2_SPTR hptr = ptr + 1;
1659
1660
0
        cc = 0;
1661
0
        while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1662
0
          {
1663
0
          if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1664
0
            {
1665
0
            *errorcodeptr = ERR77;
1666
0
            ptr = hptr;   /* Show where */
1667
0
            break;        /* *hptr != } will cause another break below */
1668
0
            }
1669
0
          cc = (cc << 4) | xc;
1670
0
          hptr++;
1671
0
          }
1672
1673
0
        if (hptr == ptr + 1 ||   /* No hex digits */
1674
0
            hptr >= ptrend ||    /* Hit end of input */
1675
0
            *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1676
0
          {
1677
0
          if (isclass) break; /* In a class, just treat as '\u' literal */
1678
0
          escape = ESC_ub;    /* Special return */
1679
0
          ptr++;              /* Skip { */
1680
0
          break;              /* Hex escape not recognized */
1681
0
          }
1682
1683
0
        c = cc;          /* Accept the code point */
1684
0
        ptr = hptr + 1;
1685
0
        }
1686
1687
3.52k
      else  /* Must be exactly 4 hex digits */
1688
3.52k
        {
1689
3.52k
        if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1690
3.50k
        if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1691
2.95k
        if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1692
2.45k
        cc = (cc << 4) | xc;
1693
2.45k
        if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1694
1.92k
        cc = (cc << 4) | xc;
1695
1.92k
        if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1696
1.52k
        c = (cc << 4) | xc;
1697
1.52k
        ptr += 4;
1698
1.52k
        }
1699
1700
1.52k
      if (utf)
1701
1.33k
        {
1702
1.33k
        if (c > 0x10ffffU) *errorcodeptr = ERR77;
1703
1.33k
        else
1704
1.33k
          if (c >= 0xd800 && c <= 0xdfff &&
1705
4
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1706
4
                *errorcodeptr = ERR73;
1707
1.33k
        }
1708
198
      else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1709
1.52k
      }
1710
1.64k
    break;
1711
1712
    /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1713
    in which case it is an upper case letter. */
1714
1715
1.64k
    case CHAR_U:
1716
257
    if (!alt_bsux) *errorcodeptr = ERR37;
1717
257
    break;
1718
1719
    /* In a character class, \g is just a literal "g". Outside a character
1720
    class, \g must be followed by one of a number of specific things:
1721
1722
    (1) A number, either plain or braced. If positive, it is an absolute
1723
    backreference. If negative, it is a relative backreference. This is a Perl
1724
    5.10 feature.
1725
1726
    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1727
    is part of Perl's movement towards a unified syntax for back references. As
1728
    this is synonymous with \k{name}, we fudge it up by pretending it really
1729
    was \k{name}.
1730
1731
    (3) For Oniguruma compatibility we also support \g followed by a name or a
1732
    number either in angle brackets or in single quotes. However, these are
1733
    (possibly recursive) subroutine calls, _not_ backreferences. We return
1734
    the ESC_g code.
1735
1736
    Summary: Return a negative number for a numerical back reference (offset
1737
    by 1), ESC_k for a named back reference, and ESC_g for a named or
1738
    numbered subroutine call.
1739
1740
    The above describes the \g behaviour inside patterns. Inside replacement
1741
    strings (pcre2_substitute) we support only \g<nameornum> for Python
1742
    compatibility. Return ESC_g for the named case, and -(num+1) for the
1743
    numbered case.
1744
    */
1745
1746
4.48k
    case CHAR_g:
1747
4.48k
    if (isclass) break;
1748
1749
4.23k
    if (ptr >= ptrend)
1750
4
      {
1751
4
      *errorcodeptr = ERR57;
1752
4
      break;
1753
4
      }
1754
1755
4.23k
    if (cb == NULL)
1756
0
      {
1757
0
      PCRE2_SPTR p;
1758
      /* Substitution strings */
1759
0
      if (*ptr != CHAR_LESS_THAN_SIGN)
1760
0
        {
1761
0
        *errorcodeptr = ERR57;
1762
0
        break;
1763
0
        }
1764
1765
0
      p = ptr + 1;
1766
1767
0
      if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,
1768
0
          errorcodeptr))
1769
0
        {
1770
0
        if (*errorcodeptr == 0) escape = ESC_g;  /* No number found */
1771
0
        break;
1772
0
        }
1773
1774
0
      if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)
1775
0
        {
1776
0
        ptr = p;
1777
0
        *errorcodeptr = ERR119;  /* Missing terminator for number */
1778
0
        break;
1779
0
        }
1780
1781
      /* This is the reason that back references are returned as -(s+1) rather
1782
      than just -s. In a pattern, \0 is not a back reference, but \g<0> is
1783
      valid in a substitution string, so this must be representable. */
1784
0
      ptr = p + 1;
1785
0
      escape = -(s+1);
1786
0
      break;
1787
0
      }
1788
1789
4.23k
    if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1790
752
      {
1791
752
      escape = ESC_g;
1792
752
      break;
1793
752
      }
1794
1795
    /* If there is a brace delimiter, try to read a numerical reference. If
1796
    there isn't one, assume we have a name and treat it as \k. */
1797
1798
3.48k
    if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1799
437
      {
1800
437
      PCRE2_SPTR p = ptr + 1;
1801
1802
944
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1803
437
      if (!read_number(&p, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1804
437
          errorcodeptr))
1805
163
        {
1806
163
        if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1807
163
        break;
1808
163
        }
1809
1.20k
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1810
1811
274
      if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1812
32
        {
1813
32
        ptr = p;
1814
32
        *errorcodeptr = ERR119;  /* Missing terminator for number */
1815
32
        break;
1816
32
        }
1817
242
      ptr = p + 1;
1818
242
      }
1819
1820
    /* Read an undelimited number */
1821
1822
3.04k
    else
1823
3.04k
      {
1824
3.04k
      if (!read_number(&ptr, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1825
3.04k
          errorcodeptr))
1826
19
        {
1827
19
        if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1828
19
        break;
1829
19
        }
1830
3.04k
      }
1831
1832
3.26k
    if (s <= 0)
1833
3
      {
1834
3
      *errorcodeptr = ERR15;
1835
3
      break;
1836
3
      }
1837
1838
3.26k
    escape = -(s+1);
1839
3.26k
    break;
1840
1841
    /* The handling of escape sequences consisting of a string of digits
1842
    starting with one that is not zero is not straightforward. Perl has changed
1843
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
1844
    recommended to avoid the ambiguities in the old syntax.
1845
1846
    Outside a character class, the digits are read as a decimal number. If the
1847
    number is less than 10, or if there are that many previous extracting left
1848
    brackets, it is a back reference. Otherwise, up to three octal digits are
1849
    read to form an escaped character code. Thus \123 is likely to be octal 123
1850
    (cf \0123, which is octal 012 followed by the literal 3). This is the "Perl
1851
    style" of handling ambiguous octal/backreferences such as \12.
1852
1853
    There is an alternative disambiguation strategy, selected by
1854
    PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must
1855
    have either a leading zero, or exactly three octal digits; otherwise it's
1856
    a backreference. The disambiguation is stable, and does not depend on how
1857
    many capture groups are defined (it's simply an invalid backreference if
1858
    there is no corresponding capture group). Additionally, octal values above
1859
    \377 (\xff) are rejected.
1860
1861
    Inside a character class, \ followed by a digit is always either a literal
1862
    8 or 9 or an octal number. */
1863
1864
17.0k
    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1865
19.3k
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1866
1867
19.3k
    if (isclass)
1868
2.70k
      {
1869
      /* Fall through to octal handling; never a backreference inside a class. */
1870
2.70k
      }
1871
16.6k
    else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)
1872
0
      {
1873
      /* Python-style disambiguation. */
1874
0
      if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&
1875
0
          ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1876
0
        {
1877
        /* We peeked a three-digit octal, so fall through */
1878
0
        }
1879
0
      else
1880
0
        {
1881
        /* We are at a digit, so the only possible error from read_number() is
1882
        a number that is too large. */
1883
0
        ptr--;   /* Back to the digit */
1884
1885
0
        if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1886
0
          {
1887
0
          *errorcodeptr = ERR61;
1888
0
          break;
1889
0
          }
1890
1891
0
        escape = -(s+1);
1892
0
        break;
1893
0
        }
1894
0
      }
1895
16.6k
    else
1896
16.6k
      {
1897
      /* Perl-style disambiguation. */
1898
16.6k
      oldptr = ptr;
1899
16.6k
      ptr--;   /* Back to the digit */
1900
1901
      /* As we know we are at a digit, the only possible error from
1902
      read_number() is a number that is too large to be a group number. Because
1903
      that number might be still valid if read as an octal, errorcodeptr is not
1904
      set on failure and therefore a sentinel value of INT_MAX is used instead
1905
      of the original value, and will be used later to properly set the error,
1906
      if not falling through. */
1907
1908
16.6k
      if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1909
2.57k
        s = INT_MAX;
1910
1911
      /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1912
      are octal escapes if there are not that many previous captures. */
1913
1914
16.6k
      if (s < 10 || c >= CHAR_8 || (unsigned)s <= bracount)
1915
12.4k
        {
1916
        /* s > MAX_GROUP_NUMBER should not be possible because of read_number(),
1917
        but we keep it just to be safe and because it will also catch the
1918
        sentinel value that was set on failure by that function. */
1919
1920
12.4k
        if ((unsigned)s > MAX_GROUP_NUMBER)
1921
7
          {
1922
7
          PCRE2_ASSERT(s == INT_MAX);
1923
7
          *errorcodeptr = ERR61;
1924
7
          }
1925
12.4k
        else escape = -(s+1);     /* Indicates a back reference */
1926
12.4k
        break;
1927
12.4k
        }
1928
1929
4.19k
      ptr = oldptr;      /* Put the pointer back and fall through */
1930
4.19k
      }
1931
1932
    /* Handle a digit following \ when the number is not a back reference, or
1933
    we are within a character class. If the first digit is 8 or 9, Perl used to
1934
    generate a binary zero and then treat the digit as a following literal. At
1935
    least by Perl 5.18 this changed so as not to insert the binary zero. */
1936
1937
6.89k
    if (c >= CHAR_8) break;
1938
1939
6.57k
    PCRE2_FALLTHROUGH /* Fall through */
1940
6.57k
1941
6.57k
    /* \0 always starts an octal number, but we may drop through to here with a
1942
6.57k
    larger first octal digit. The original code used just to take the least
1943
6.57k
    significant 8 bits of octal numbers (I think this is what early Perls used
1944
6.57k
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,
1945
6.57k
    but no more than 3 octal digits. */
1946
6.57k
1947
8.00k
    case CHAR_0:
1948
8.00k
    c -= CHAR_0;
1949
15.7k
    while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)  /* i is 0 on entry: the table lookup above stored 0 in it */
1950
7.73k
        c = c * 8 + *ptr++ - CHAR_0;
1951
8.00k
    if (c > 0xff)
1952
165
      {
1953
165
      if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) *errorcodeptr = ERR102;
1954
165
#if PCRE2_CODE_UNIT_WIDTH == 8
1955
165
      else if (!utf) *errorcodeptr = ERR51;
1956
165
#endif
1957
165
      }
1958
1959
    /* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect
1960
    two- or three-character octal escapes \00 and \000, nor \x00. */
1961
1962
8.00k
    if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)
1963
0
        *errorcodeptr = ERR98;
1964
8.00k
    break;
1965
1966
    /* \o is a relatively new Perl feature, supporting a more general way of
1967
    specifying character codes in octal. The only supported form is \o{ddd},
1968
    with optional spaces or tabs after { and before }. */
1969
1970
1.24k
    case CHAR_o:
1971
1.24k
    if (ptr >= ptrend || *ptr != CHAR_LEFT_CURLY_BRACKET)
1972
11
      {
1973
11
      *errorcodeptr = ERR55;
1974
11
      break;
1975
11
      }
1976
1.23k
    ptr++;
1977
1978
1.63k
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1979
1.23k
    if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1980
20
      {
1981
20
      *errorcodeptr = ERR78;
1982
20
      break;
1983
20
      }
1984
1985
1.21k
    c = 0;
1986
1.21k
    overflow = FALSE;
1987
3.36k
    while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1988
2.19k
      {
1989
2.19k
      cc = *ptr++;
1990
2.19k
      if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1991
#if PCRE2_CODE_UNIT_WIDTH == 32
1992
      if (c >= 0x20000000u) { overflow = TRUE; break; }
1993
#endif
1994
1.77k
      c = (c << 3) + (cc - CHAR_0);
1995
1.77k
#if PCRE2_CODE_UNIT_WIDTH == 8
1996
1.77k
      if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1997
#elif PCRE2_CODE_UNIT_WIDTH == 16
1998
      if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1999
#elif PCRE2_CODE_UNIT_WIDTH == 32
2000
      if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
2001
#endif
2002
1.77k
      }
2003
2004
5.23k
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2005
2006
1.21k
    if (overflow)
2007
52
      {
2008
340
      while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
2009
52
      *errorcodeptr = ERR34;
2010
52
      }
2011
1.16k
    else if (utf && c >= 0xd800 && c <= 0xdfff &&
2012
3
             (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2013
3
      {
2014
3
      *errorcodeptr = ERR73;
2015
3
      }
2016
1.16k
    else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)
2017
1.06k
      {
2018
1.06k
      ptr++;
2019
1.06k
      }
2020
99
    else
2021
99
      {
2022
99
      *errorcodeptr = ERR64;
2023
99
      goto ESCAPE_FAILED_FORWARD;
2024
99
      }
2025
1.11k
    break;
2026
2027
    /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
2028
    by two hexadecimal digits. Otherwise it is a lowercase x letter. */
2029
2030
2.39k
    case CHAR_x:
2031
2.39k
    if (alt_bsux)
2032
680
      {
2033
680
      uint32_t xc;
2034
680
      if (ptrend - ptr < 2) break;               /* Less than 2 characters */
2035
674
      if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
2036
435
      if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
2037
223
      c = (cc << 4) | xc;
2038
223
      ptr += 2;
2039
223
      }
2040
2041
    /* Handle \x in Perl's style. \x{ddd} is a character code which can be
2042
    greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
2043
    digits. If not, { used to be treated as a data character. However, Perl
2044
    seems to read hex digits up to the first non-such, and ignore the rest, so
2045
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2046
    now gives an error. */
2047
2048
1.71k
    else
2049
1.71k
      {
2050
1.71k
      if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
2051
920
        {
2052
920
        ptr++;
2053
1.32k
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2054
2055
920
#ifndef EBCDIC
2056
998
        COME_FROM_NU:
2057
998
#endif
2058
998
        if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
2059
23
          {
2060
23
          *errorcodeptr = ERR78;
2061
23
          break;
2062
23
          }
2063
975
        c = 0;
2064
975
        overflow = FALSE;
2065
2066
2.59k
        while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2067
1.66k
          {
2068
1.66k
          ptr++;
2069
1.66k
          if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2070
#if PCRE2_CODE_UNIT_WIDTH == 32
2071
          if (c >= 0x10000000l) { overflow = TRUE; break; }
2072
#endif
2073
1.38k
          c = (c << 4) | cc;
2074
1.38k
          if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2075
42
            {
2076
42
            overflow = TRUE;
2077
42
            break;
2078
42
            }
2079
1.38k
          }
2080
2081
        /* Perl ignores spaces and tabs before } */
2082
2083
1.40k
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2084
2085
        /* On overflow, skip remaining hex digits */
2086
2087
975
        if (overflow)
2088
42
          {
2089
442
          while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2090
42
          *errorcodeptr = ERR34;
2091
42
          }
2092
933
        else if (utf && c >= 0xd800 && c <= 0xdfff &&
2093
3
                 (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2094
3
          {
2095
3
          *errorcodeptr = ERR73;
2096
3
          }
2097
930
        else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)
2098
852
          {
2099
852
          ptr++;
2100
852
          }
2101
2102
        /* If the sequence of hex digits (followed by optional space) does not
2103
        end with '}', give an error. We used just to recognize this construct
2104
        and fall through to the normal \x handling, but nowadays Perl gives an
2105
        error, which seems much more sensible, so we do too. */
2106
2107
78
        else
2108
78
          {
2109
78
          *errorcodeptr = ERR67;
2110
78
          goto ESCAPE_FAILED_FORWARD;
2111
78
          }
2112
975
        }   /* End of \x{} processing */
2113
2114
      /* Read up to two hex digits after \x */
2115
2116
790
      else
2117
790
        {
2118
        /* Perl has the surprising/broken behaviour that \x without following
2119
        hex digits is treated as an escape for NUL. Their source code laments
2120
        this but keeps it for backwards compatibility. A warning is printed
2121
        when "use warnings" is enabled. Because we don't have warnings, we
2122
        simply forbid it. */
2123
790
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff)
2124
31
          {
2125
          /* Not a hex digit */
2126
31
          *errorcodeptr = ERR78;
2127
31
          break;
2128
31
          }
2129
759
        ptr++;
2130
759
        c = cc;
2131
2132
        /* With "use re 'strict'" Perl actually requires exactly two digits (error
2133
        for \x, \xA and \xAAA). While \x was already rejected, this seems overly
2134
        strict, and there seems little incentive to align with that, given the
2135
        backwards-compatibility cost.
2136
2137
        For comparison, note that other engines disagree. For example:
2138
          - Java allows 1 or 2 hex digits. Error if 0 digits. No error if >2 digits
2139
          - .NET requires 2 hex digits. Error if 0, 1 digits. No error if >2 digits.
2140
        */
2141
759
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2142
336
        ptr++;
2143
336
        c = (c << 4) | cc;
2144
336
        }     /* End of \xdd handling */
2145
1.71k
      }       /* End of Perl-style \x handling */
2146
1.45k
    break;
2147
2148
    /* The handling of \c is different in ASCII and EBCDIC environments. In an
2149
    ASCII (or Unicode) environment, an error is given if the character
2150
    following \c is not a printable ASCII character. Otherwise, the following
2151
    character is upper-cased if it is a letter, and after that the 0x40 bit is
2152
    flipped. The result is the value of the escape.
2153
2154
    In an EBCDIC environment the handling of \c is compatible with the
2155
    specification in the perlebcdic document. The following character must be
2156
    a letter or one of small number of special characters. These provide a
2157
    means of defining the character values 0-31.
2158
2159
    For testing the EBCDIC handling of \c in an ASCII environment, recognize
2160
    the EBCDIC value of 'c' explicitly. */
2161
2162
1.45k
    case CHAR_c:
2163
1.12k
    if (ptr >= ptrend)
2164
3
      {
2165
3
      *errorcodeptr = ERR2;
2166
3
      break;
2167
3
      }
2168
1.11k
    c = *ptr;
2169
1.11k
    if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2170
2171
    /* Handle \c in an ASCII/Unicode environment. */
2172
2173
1.11k
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
2174
1.11k
    if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2175
14
      {
2176
14
      *errorcodeptr = ERR68;
2177
14
      goto ESCAPE_FAILED_FORWARD;
2178
14
      }
2179
1.10k
    c ^= 0x40;
2180
2181
    /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2182
    255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2183
    POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2184
    The other valid sequences correspond to a list of specific characters. */
2185
2186
#else
2187
    if (c == CHAR_QUESTION_MARK)
2188
      c = (CHAR_BACKSLASH == 188 && CHAR_GRAVE_ACCENT == 74)? 0x5f : 0xff;
2189
    else
2190
      {
2191
      for (i = 0; i < 32; i++)
2192
        {
2193
        if (c == ebcdic_escape_c[i]) break;
2194
        }
2195
      if (i < 32)
2196
        c = i;
2197
      else
2198
        {
2199
        *errorcodeptr = ERR68;
2200
        goto ESCAPE_FAILED_FORWARD;
2201
        }
2202
      }
2203
#endif  /* EBCDIC */
2204
2205
1.10k
    ptr++;
2206
1.10k
    break;
2207
2208
    /* Any other alphanumeric following \ is an error. Perl gives an error only
2209
    if in warning mode, but PCRE doesn't have a warning mode. */
2210
2211
9
    default:
2212
9
    *errorcodeptr = ERR3;
2213
9
    break;
2214
33.9k
    }
2215
33.9k
  }
2216
2217
/* Set the pointer to the next character before returning. */
2218
2219
538k
EXIT:
2220
538k
*ptrptr = ptr;
2221
538k
*chptr = c;
2222
538k
return escape;
2223
2224
/* Some errors need to indicate the next character. */
2225
2226
191
ESCAPE_FAILED_FORWARD:
2227
191
ptr++;
2228
191
#ifdef SUPPORT_UNICODE
2229
191
if (utf) FORWARDCHARTEST(ptr, ptrend);
2230
191
#endif
2231
191
goto EXIT;
2232
538k
}
2233
2234
2235
2236
#ifdef SUPPORT_UNICODE
2237
/*************************************************
2238
*               Handle \P and \p                 *
2239
*************************************************/
2240
2241
/* This function is called after \P or \p has been encountered, provided that
2242
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2243
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2244
after the final code unit of the escape sequence.
2245
2246
Arguments:
2247
  ptrptr         the pattern position pointer
2248
  utf            true if the input is UTF-encoded
2249
  negptr         a boolean that is set TRUE for negation else FALSE
2250
  ptypeptr       an unsigned int that is set to the type value
2251
  pdataptr       an unsigned int that is set to the detailed property value
2252
  errorcodeptr   the error code variable
2253
  cb             the compile data
2254
2255
Returns:         TRUE if the type value was found, or FALSE for an invalid type
2256
*/
2257
2258
static BOOL
2259
get_ucp(PCRE2_SPTR *ptrptr, BOOL utf, BOOL *negptr, uint16_t *ptypeptr,
2260
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2261
21.1k
{
2262
21.1k
uint32_t c;
2263
21.1k
ptrdiff_t i;
2264
21.1k
PCRE2_SIZE bot, top;
2265
21.1k
PCRE2_SPTR ptr = *ptrptr;
2266
21.1k
PCRE2_UCHAR name[50];
2267
21.1k
PCRE2_UCHAR *vptr = NULL;
2268
21.1k
uint16_t ptscript = PT_NOTSCRIPT;
2269
2270
#ifndef MAYBE_UTF_MULTI
2271
(void)utf;  /* Avoid compiler warning */
2272
#endif
2273
2274
21.1k
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2275
21.1k
GETCHARINCTEST(c, ptr);
2276
21.1k
*negptr = FALSE;
2277
2278
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2279
negation. We must be handling Unicode encoding here, though we may be compiling
2280
for UTF-8 input in an EBCDIC environment. (PCRE2 does not support both EBCDIC
2281
input and Unicode input in the same build.) In accordance with Unicode's "loose
2282
matching" rules, ASCII white space, hyphens, and underscores are ignored. We
2283
don't use isspace() or tolower() because (a) code points may be greater than
2284
255, and (b) they wouldn't work when compiling for Unicode in an EBCDIC
2285
environment. */
2286
2287
21.1k
if (c == CHAR_LEFT_CURLY_BRACKET)
2288
15.7k
  {
2289
15.7k
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2290
2291
58.9k
  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2292
58.9k
    {
2293
61.0k
    REDO:
2294
2295
61.0k
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2296
60.9k
    GETCHARINCTEST(c, ptr);
2297
2298
    /* Skip ignorable Unicode characters. */
2299
2300
60.9k
    if (c == CHAR_UNDERSCORE || c == CHAR_MINUS || c == CHAR_SPACE ||
2301
59.8k
        (c >= CHAR_HT && c <= CHAR_CR))
2302
1.37k
      {
2303
1.37k
      goto REDO;
2304
1.37k
      }
2305
2306
    /* The first significant character being circumflex negates the meaning of
2307
    the item. */
2308
2309
59.5k
    if (i == 0 && !*negptr && c == CHAR_CIRCUMFLEX_ACCENT)
2310
777
      {
2311
777
      *negptr = TRUE;
2312
777
      goto REDO;
2313
777
      }
2314
2315
58.8k
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2316
2317
    /* Names consist of ASCII letters and digits, but equals and colon may also
2318
    occur as a name/value separator. We must also allow for \p{L&}. A simple
2319
    check for a value between '&' and 'z' suffices because anything else in a
2320
    name or value will cause an "unknown property" error anyway. */
2321
2322
43.2k
    if (c < CHAR_AMPERSAND || c > CHAR_z) goto ERROR_RETURN;
2323
2324
    /* Lower case a capital letter or remember where the name/value separator
2325
    is. */
2326
2327
43.2k
    if (c >= CHAR_A && c <= CHAR_Z) c |= 0x20;
2328
23.5k
    else if ((c == CHAR_COLON || c == CHAR_EQUALS_SIGN) && vptr == NULL)
2329
1.64k
      vptr = name + i;
2330
2331
43.2k
    name[i] = c;
2332
43.2k
    }
2333
2334
  /* Error if the loop didn't end with '}' - either we hit the end of the
2335
  pattern or the name was longer than any legal property name. */
2336
2337
15.5k
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2338
15.5k
  name[i] = 0;
2339
15.5k
  }
2340
2341
/* If { doesn't follow \p or \P there is just one following character, which
2342
must be an ASCII letter. */
2343
2344
5.43k
else if (c >= CHAR_A && c <= CHAR_Z)
2345
2.82k
  {
2346
2.82k
  name[0] = c | 0x20;  /* Lower case */
2347
2.82k
  name[1] = 0;
2348
2.82k
  }
2349
2.61k
else if (c >= CHAR_a && c <= CHAR_z)
2350
2.56k
  {
2351
2.56k
  name[0] = c;
2352
2.56k
  name[1] = 0;
2353
2.56k
  }
2354
52
else goto ERROR_RETURN;
2355
2356
20.9k
*ptrptr = ptr;   /* Update pattern pointer */
2357
2358
/* If the property contains ':' or '=' we have class name and value separately
2359
specified. The following are supported:
2360
2361
  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2362
  . Script (synonym sc) for which the property name is the script name
2363
  . Script_Extensions (synonym scx), ditto
2364
2365
As this is a small number, we currently just check the names directly. If this
2366
grows, a sorted table and a switch will be neater.
2367
2368
For both the script properties, set a PT_xxx value so that (1) they can be
2369
distinguished and (2) invalid script names that happen to be the name of
2370
another property can be diagnosed. */
2371
2372
20.9k
if (vptr != NULL)
2373
1.61k
  {
2374
1.61k
  int offset = 0;
2375
1.61k
  PCRE2_UCHAR sname[8];
2376
2377
1.61k
  *vptr = 0;   /* Terminate property name */
2378
1.61k
  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2379
1.61k
      PRIV(strcmp_c8)(name, STRING_bc) == 0)
2380
1.26k
    {
2381
1.26k
    offset = 4;
2382
1.26k
    sname[0] = CHAR_b;
2383
1.26k
    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2384
1.26k
    sname[2] = CHAR_d;
2385
1.26k
    sname[3] = CHAR_i;
2386
1.26k
    }
2387
2388
348
  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2389
264
           PRIV(strcmp_c8)(name, STRING_sc) == 0)
2390
186
    ptscript = PT_SC;
2391
2392
162
  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2393
90
           PRIV(strcmp_c8)(name, STRING_scx) == 0)
2394
154
    ptscript = PT_SCX;
2395
2396
8
  else
2397
8
    {
2398
8
    *errorcodeptr = ERR47;
2399
8
    return FALSE;
2400
8
    }
2401
2402
  /* Adjust the string in name[] as needed */
2403
2404
1.60k
  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2405
1.60k
  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2406
1.60k
  }
2407
2408
/* Search for a recognized property using binary chop. */
2409
2410
20.9k
bot = 0;
2411
20.9k
top = PRIV(utt_size);
2412
2413
163k
while (bot < top)
2414
163k
  {
2415
163k
  int r;
2416
163k
  i = (bot + top) >> 1;
2417
163k
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2418
2419
  /* When a matching property is found, some extra checking is needed when the
2420
  \p{xx:yy} syntax is used and xx is either sc or scx. */
2421
2422
163k
  if (r == 0)
2423
20.8k
    {
2424
20.8k
    *pdataptr = PRIV(utt)[i].value;
2425
20.8k
    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2426
20.5k
      {
2427
20.5k
      *ptypeptr = PRIV(utt)[i].type;
2428
20.5k
      return TRUE;
2429
20.5k
      }
2430
2431
327
    switch (PRIV(utt)[i].type)
2432
327
      {
2433
143
      case PT_SC:
2434
143
      *ptypeptr = PT_SC;
2435
143
      return TRUE;
2436
2437
179
      case PT_SCX:
2438
179
      *ptypeptr = ptscript;
2439
179
      return TRUE;
2440
327
      }
2441
2442
5
    break;  /* Non-script found */
2443
327
    }
2444
2445
142k
  if (r > 0) bot = i + 1; else top = i;
2446
142k
  }
2447
2448
52
*errorcodeptr = ERR47;   /* Unrecognized property */
2449
52
return FALSE;
2450
2451
264
ERROR_RETURN:            /* Malformed \P or \p */
2452
264
*errorcodeptr = ERR46;
2453
264
*ptrptr = ptr;
2454
264
return FALSE;
2455
20.9k
}
2456
#endif
2457
2458
2459
2460
/*************************************************
2461
*           Check for POSIX class syntax         *
2462
*************************************************/
2463
2464
/* This function is called when the sequence "[:" or "[." or "[=" is
2465
encountered in a character class. It checks whether this is followed by a
2466
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2467
reach an unescaped ']' without the special preceding character, return FALSE.
2468
2469
Originally, this function only recognized a sequence of letters between the
2470
terminators, but it seems that Perl recognizes any sequence of characters,
2471
though of course unknown POSIX names are subsequently rejected. Perl gives an
2472
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2473
didn't consider this to be a POSIX class. Likewise for [:1234:].
2474
2475
The problem in trying to be exactly like Perl is in the handling of escapes. We
2476
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2477
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2478
below handles the special cases \\ and \], but does not try to do any other
2479
escape processing. This makes it different from Perl for cases such as
2480
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2481
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2482
when Perl does, I think.
2483
2484
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2485
It seems that the appearance of a nested POSIX class supersedes an apparent
2486
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2487
a digit. This is handled by returning FALSE if the start of a new group with
2488
the same terminator is encountered, since the next closing sequence must close
2489
the nested group, not the outer one.
2490
2491
In Perl, unescaped square brackets may also appear as part of class names. For
2492
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2493
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2494
seem right at all. PCRE does not allow closing square brackets in POSIX class
2495
names.
2496
2497
Arguments:
2498
  ptr      pointer to the character after the initial [ (colon, dot, equals)
2499
  ptrend   pointer to the end of the pattern
2500
  endptr   where to return a pointer to the terminating ':', '.', or '='
2501
2502
Returns:   TRUE or FALSE
2503
*/
2504
2505
static BOOL
2506
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2507
38.9k
{
2508
38.9k
PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2509
38.9k
terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2510
2511
411k
for (; ptrend - ptr >= 2; ptr++)
2512
410k
  {
2513
410k
  if (*ptr == CHAR_BACKSLASH &&
2514
13.2k
      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2515
765
    ptr++;
2516
2517
410k
  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2518
402k
            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2519
2520
392k
  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2521
20.9k
    {
2522
20.9k
    *endptr = ptr;
2523
20.9k
    return TRUE;
2524
20.9k
    }
2525
410k
  }
2526
2527
314
return FALSE;
2528
38.9k
}
2529
2530
2531
2532
/*************************************************
2533
*          Check POSIX class name                *
2534
*************************************************/
2535
2536
/* This function is called to check the name given in a POSIX-style class entry
2537
such as [:alnum:].
2538
2539
Arguments:
2540
  ptr        points to the first letter
2541
  len        the length of the name
2542
2543
Returns:     a value representing the name, or -1 if unknown
2544
*/
2545
2546
static int
2547
check_posix_name(PCRE2_SPTR ptr, int len)
2548
20.7k
{
2549
20.7k
const char *pn = posix_names;
2550
20.7k
int yield = 0;
2551
198k
while (posix_name_lengths[yield] != 0)
2552
198k
  {
2553
198k
  if (len == posix_name_lengths[yield] &&
2554
105k
    PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2555
177k
  pn += posix_name_lengths[yield] + 1;
2556
177k
  yield++;
2557
177k
  }
2558
35
return -1;
2559
20.7k
}
2560
2561
2562
2563
/*************************************************
2564
*       Read a subpattern or VERB name           *
2565
*************************************************/
2566
2567
/* This function is called from parse_regex() below whenever it needs to read
2568
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2569
pointer must be to the preceding character. If that character is '*' we are
2570
reading a verb or alpha assertion name. The pointer is updated to point after
2571
the name, for a VERB or alpha assertion name, or after the name's terminator
2572
for a subpattern name. Returning both the offset and the name pointer is
2573
redundant information, but some callers use one and some the other, so it is
2574
simplest just to return both. When the name is in braces, spaces and tabs are
2575
allowed (and ignored) at either end.
2576
2577
Arguments:
2578
  ptrptr      points to the character pointer variable
2579
  ptrend      points to the end of the input string
2580
  utf         true if the input is UTF-encoded
2581
  terminator  the terminator of a subpattern name must be this
2582
  offsetptr   where to put the offset from the start of the pattern
2583
  nameptr     where to put a pointer to the name in the input
2584
  namelenptr  where to put the length of the name
2585
  errorcodeptr where to put an error code
2586
  cb          pointer to the compile data block
2587
2588
Returns:    TRUE if a name was read
2589
            FALSE otherwise, with error code set
2590
*/
2591
2592
static BOOL
2593
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2594
  PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2595
  int *errorcodeptr, compile_block *cb)
2596
68.2k
{
2597
68.2k
PCRE2_SPTR ptr = *ptrptr;
2598
68.2k
BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2599
68.2k
BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2600
2601
68.2k
if (is_braced)
2602
614
  while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2603
2604
68.2k
if (ptr >= ptrend)                 /* No characters in name */
2605
62
  {
2606
62
  *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2607
62
                            ERR60; /* Verb not recognized or malformed */
2608
62
  goto FAILED;
2609
62
  }
2610
2611
68.1k
*nameptr = ptr;
2612
68.1k
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2613
2614
/* If this logic were ever to change, the matching function in pcre2_substitute.c
2615
ought to be updated to match. */
2616
2617
/* In UTF mode, a group name may contain letters and decimal digits as defined
2618
by Unicode properties, and underscores, but must not start with a digit. */
2619
2620
68.1k
#ifdef SUPPORT_UNICODE
2621
68.1k
if (utf && is_group)
2622
1.39k
  {
2623
1.39k
  uint32_t c, type;
2624
1.39k
  PCRE2_SPTR p = ptr;
2625
2626
1.39k
  GETCHARINC(c, p);  /* Peek at next character */
2627
1.39k
  type = UCD_CHARTYPE(c);
2628
2629
1.39k
  if (type == ucp_Nd)
2630
3
    {
2631
3
    ptr = p;
2632
3
    *errorcodeptr = ERR44;
2633
3
    goto FAILED;
2634
3
    }
2635
2636
1.39k
  for(;;)
2637
3.52k
    {
2638
3.52k
    if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2639
1.46k
        c != CHAR_UNDERSCORE) break;
2640
2.15k
    ptr = p;  /* Accept character and peek again */
2641
2.15k
    if (p >= ptrend) break;
2642
2.12k
    GETCHARINC(c, p);
2643
2.12k
    type = UCD_CHARTYPE(c);
2644
2.12k
    }
2645
1.39k
  }
2646
66.7k
else
2647
#else
2648
(void)utf;  /* Avoid compiler warning */
2649
#endif      /* SUPPORT_UNICODE */
2650
2651
/* Handle non-group names and group names in non-UTF modes. A group name must
2652
not start with a digit. If either of the others start with a digit it just
2653
won't be recognized. */
2654
2655
66.7k
  {
2656
66.7k
  if (is_group && IS_DIGIT(*ptr))
2657
3
    {
2658
3
    ++ptr;
2659
3
    *errorcodeptr = ERR44;
2660
3
    goto FAILED;
2661
3
    }
2662
2663
287k
  while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2664
220k
    {
2665
220k
    ptr++;
2666
220k
    }
2667
66.7k
  }
2668
2669
/* Check name length */
2670
2671
68.1k
if (ptr - *nameptr > MAX_NAME_SIZE)
2672
8
  {
2673
8
  *errorcodeptr = ERR48;
2674
8
  goto FAILED;
2675
8
  }
2676
68.1k
*namelenptr = (uint32_t)(ptr - *nameptr);
2677
2678
/* Subpattern names must not be empty, and their terminator is checked here.
2679
(What follows a verb or alpha assertion name is checked separately.) */
2680
2681
68.1k
if (is_group)
2682
32.8k
  {
2683
32.8k
  if (ptr == *nameptr)
2684
82
    {
2685
82
    *errorcodeptr = ERR62;   /* Subpattern name expected */
2686
82
    goto FAILED;
2687
82
    }
2688
32.8k
  if (is_braced)
2689
531
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2690
32.8k
  if (terminator != 0)
2691
32.6k
    {
2692
32.6k
    if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2693
139
      {
2694
139
      *errorcodeptr = ERR42;
2695
139
      goto FAILED;
2696
139
      }
2697
32.4k
    ptr++;
2698
32.4k
    }
2699
32.8k
  }
2700
2701
67.9k
*ptrptr = ptr;
2702
67.9k
return TRUE;
2703
2704
297
FAILED:
2705
297
*ptrptr = ptr;
2706
297
return FALSE;
2707
68.1k
}
2708
2709
2710
2711
/**************************************************
2712
*        Parse capturing bracket argument list    *
2713
**************************************************/
2714
2715
/* Reads a list of capture references. The references
2716
can be numbers or names.
2717
2718
Arguments:
2719
  ptrptr           points to the character pointer variable
2720
  ptrend           points to the end of the input string
2721
  utf              true if the input is UTF-encoded
2722
  parsed_pattern   the parsed pattern pointer
2723
  offset           last known offset
2724
  errorcodeptr     where to put an error code
2725
  cb               pointer to the compile data block
2726
2727
Returns: updated parsed_pattern pointer on success
2728
         NULL otherwise
2729
*/
2730
2731
static uint32_t *
2732
parse_capture_list(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
2733
  BOOL utf, uint32_t *parsed_pattern, PCRE2_SIZE offset,
2734
  int *errorcodeptr, compile_block *cb)
2735
6.90k
{
2736
6.90k
PCRE2_SIZE next_offset;
2737
6.90k
PCRE2_SPTR ptr = *ptrptr;
2738
6.90k
PCRE2_SPTR name;
2739
6.90k
PCRE2_UCHAR terminator;
2740
6.90k
uint32_t meta, namelen;
2741
6.90k
int i;
2742
2743
6.90k
if (ptr >= ptrend || *ptr != CHAR_LEFT_PARENTHESIS)
2744
6
  {
2745
6
  *errorcodeptr = ERR118;
2746
6
  goto FAILED;
2747
6
  }
2748
2749
6.89k
for (;;)
2750
8.40k
  {
2751
8.40k
  ptr++;
2752
8.40k
  next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
2753
2754
8.40k
  if (ptr >= ptrend)
2755
4
    {
2756
4
    *errorcodeptr = ERR117;
2757
4
    goto FAILED;
2758
4
    }
2759
2760
  /* Handle [+-]number cases */
2761
8.40k
  if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
2762
8.40k
      &i, errorcodeptr))
2763
6.82k
    {
2764
6.82k
    PCRE2_ASSERT(i >= 0);
2765
6.82k
    if (i <= 0)
2766
14
      {
2767
14
      *errorcodeptr = ERR15;
2768
14
      goto FAILED;
2769
14
      }
2770
6.81k
    meta = META_CAPTURE_NUMBER;
2771
6.81k
    namelen = (uint32_t)i;
2772
6.81k
    }
2773
1.57k
  else if (*errorcodeptr != 0) goto FAILED; /* Number too big */
2774
1.53k
  else
2775
1.53k
    {
2776
    /* Handle 'name' or <name> cases. */
2777
1.53k
    if (*ptr == CHAR_LESS_THAN_SIGN)
2778
863
      terminator = CHAR_GREATER_THAN_SIGN;
2779
674
    else if (*ptr == CHAR_APOSTROPHE)
2780
650
      terminator = CHAR_APOSTROPHE;
2781
24
    else
2782
24
      {
2783
24
      *errorcodeptr = ERR117;
2784
24
      goto FAILED;
2785
24
      }
2786
2787
1.51k
    if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,
2788
1.51k
        &name, &namelen, errorcodeptr, cb)) goto FAILED;
2789
2790
1.50k
    meta = META_CAPTURE_NAME;
2791
1.50k
    }
2792
2793
8.31k
  PCRE2_ASSERT(next_offset > 0);
2794
8.31k
  if (offset == 0 || (next_offset - offset) >= 0x10000)
2795
1.90k
    {
2796
1.90k
    *parsed_pattern++ = META_OFFSET;
2797
1.90k
    PUTOFFSET(next_offset, parsed_pattern);
2798
1.90k
    offset = next_offset;
2799
1.90k
    }
2800
2801
  /* The offset is encoded as a relative offset, because for some
2802
  inputs such as ",2" in (1,2,3), we only have space for two uint32_t
2803
  values, and an opcode and absolute offset may require three uint32_t
2804
  values. */
2805
8.31k
  *parsed_pattern++ = meta | (uint32_t)(next_offset - offset);
2806
8.31k
  *parsed_pattern++ = namelen;
2807
8.31k
  offset = next_offset;
2808
2809
8.31k
  if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
2810
2811
8.30k
  if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
2812
2813
1.51k
  if (*ptr != CHAR_COMMA)
2814
9
    {
2815
9
    *errorcodeptr = ERR24;
2816
9
    goto FAILED;
2817
9
    }
2818
1.51k
  }
2819
2820
6.79k
*ptrptr = ptr + 1;
2821
6.79k
return parsed_pattern;
2822
2823
10
UNCLOSED_PARENTHESIS:
2824
10
*errorcodeptr = ERR14;
2825
2826
111
FAILED:
2827
111
*ptrptr = ptr;
2828
111
return NULL;
2829
10
}
2830
2831
2832
2833
/*************************************************
2834
*          Manage callouts at start of cycle     *
2835
*************************************************/
2836
2837
/* At the start of a new item in parse_regex() we are able to record the
2838
details of the previous item in a prior callout, and also to set up an
2839
automatic callout if enabled. Avoid having two adjacent automatic callouts,
2840
which would otherwise happen for items such as \Q that contribute nothing to
2841
the parsed pattern.
2842
2843
Arguments:
2844
  ptr              current pattern pointer
2845
  pcalloutptr      points to a pointer to previous callout, or NULL
2846
  auto_callout     TRUE if auto_callouts are enabled
2847
  parsed_pattern   the parsed pattern pointer
2848
  cb               compile block
2849
2850
Returns: possibly updated parsed_pattern pointer.
2851
*/
2852
2853
static uint32_t *
2854
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2855
  uint32_t *parsed_pattern, compile_block *cb)
2856
8.16M
{
2857
8.16M
uint32_t *previous_callout = *pcalloutptr;
2858
2859
8.16M
if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2860
1.24M
  cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2861
2862
8.16M
if (!auto_callout) previous_callout = NULL; else
2863
1.25M
  {
2864
1.25M
  if (previous_callout == NULL ||
2865
1.24M
      previous_callout != parsed_pattern - 4 ||
2866
3.52k
      previous_callout[3] != 255)
2867
1.25M
    {
2868
1.25M
    previous_callout = parsed_pattern;  /* Set up new automatic callout */
2869
1.25M
    parsed_pattern += 4;
2870
1.25M
    previous_callout[0] = META_CALLOUT_NUMBER;
2871
1.25M
    previous_callout[2] = 0;
2872
1.25M
    previous_callout[3] = 255;
2873
1.25M
    }
2874
1.25M
  previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2875
1.25M
  }
2876
2877
8.16M
*pcalloutptr = previous_callout;
2878
8.16M
return parsed_pattern;
2879
8.16M
}
2880
2881
2882
2883
/*************************************************
2884
*          Handle \d, \D, \s, \S, \w, \W         *
2885
*************************************************/
2886
2887
/* This function is called from parse_regex() below, both for freestanding
2888
escapes, and those within classes, to handle those escapes that may change when
2889
Unicode property support is requested. Note that PCRE2_UCP will never be set
2890
without Unicode support because that is checked when pcre2_compile() is called.
2891
2892
Arguments:
2893
  escape          the ESC_... value
2894
  parsed_pattern  where to add the code
2895
  options         options bits
2896
  xoptions        extra options bits
2897
2898
Returns:          updated value of parsed_pattern
2899
*/
2900
static uint32_t *
2901
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2902
  uint32_t xoptions)
2903
205k
{
2904
205k
uint32_t ascii_option = 0;
2905
205k
uint32_t prop = ESC_p;
2906
2907
205k
switch(escape)
2908
205k
  {
2909
16.4k
  case ESC_D:
2910
16.4k
  prop = ESC_P;
2911
16.4k
  PCRE2_FALLTHROUGH /* Fall through */
2912
39.4k
  case ESC_d:
2913
39.4k
  ascii_option = PCRE2_EXTRA_ASCII_BSD;
2914
39.4k
  break;
2915
2916
40.7k
  case ESC_S:
2917
40.7k
  prop = ESC_P;
2918
40.7k
  PCRE2_FALLTHROUGH /* Fall through */
2919
89.2k
  case ESC_s:
2920
89.2k
  ascii_option = PCRE2_EXTRA_ASCII_BSS;
2921
89.2k
  break;
2922
2923
22.0k
  case ESC_W:
2924
22.0k
  prop = ESC_P;
2925
22.0k
  PCRE2_FALLTHROUGH /* Fall through */
2926
76.2k
  case ESC_w:
2927
76.2k
  ascii_option = PCRE2_EXTRA_ASCII_BSW;
2928
76.2k
  break;
2929
205k
  }
2930
2931
205k
if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2932
162k
  {
2933
162k
  *parsed_pattern++ = META_ESCAPE + escape;
2934
162k
  }
2935
42.3k
else
2936
42.3k
  {
2937
42.3k
  *parsed_pattern++ = META_ESCAPE + prop;
2938
42.3k
  switch(escape)
2939
42.3k
    {
2940
6.38k
    case ESC_d:
2941
10.0k
    case ESC_D:
2942
10.0k
    *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2943
10.0k
    break;
2944
2945
6.20k
    case ESC_s:
2946
15.9k
    case ESC_S:
2947
15.9k
    *parsed_pattern++ = PT_SPACE << 16;
2948
15.9k
    break;
2949
2950
10.9k
    case ESC_w:
2951
16.2k
    case ESC_W:
2952
16.2k
    *parsed_pattern++ = PT_WORD << 16;
2953
16.2k
    break;
2954
42.3k
    }
2955
42.3k
  }
2956
2957
205k
return parsed_pattern;
2958
205k
}
2959
2960
2961
2962
/*************************************************
2963
* Maximum size of parsed_pattern for given input *
2964
*************************************************/
2965
2966
/* This function is called from parse_regex() below, to determine the amount
2967
of memory to allocate for parsed_pattern. It is also called to check whether
2968
the amount of data written respects the amount of memory allocated.
2969
2970
Arguments:
2971
  ptr             points to the start of the pattern
2972
  ptrend          points to the end of the pattern
2973
  utf             TRUE in UTF mode
2974
  options         the options bits
2975
2976
Returns:          the number of uint32_t units for parsed_pattern
2977
*/
2978
static ptrdiff_t
2979
max_parsed_pattern(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, BOOL utf,
2980
  uint32_t options)
2981
84.5k
{
2982
84.5k
PCRE2_SIZE big32count = 0;
2983
84.5k
ptrdiff_t parsed_size_needed;
2984
2985
/* When PCRE2_AUTO_CALLOUT is not set, in all but one case the number of
2986
unsigned 32-bit ints written out to the parsed pattern is bounded by the length
2987
of the pattern. The exceptional case is when running in 32-bit, non-UTF mode,
2988
when literal characters greater than META_END (0x80000000) have to be coded as
2989
two units. In this case, therefore, we scan the pattern to check for such
2990
values. */
2991
2992
#if PCRE2_CODE_UNIT_WIDTH == 32
2993
if (!utf)
2994
  {
2995
  PCRE2_SPTR p;
2996
  for (p = ptr; p < ptrend; p++) if (*p >= META_END) big32count++;
2997
  }
2998
#else
2999
84.5k
(void)utf;  /* Avoid compiler warning */
3000
84.5k
#endif
3001
3002
84.5k
parsed_size_needed = (ptrend - ptr) + big32count;
3003
3004
/* When PCRE2_AUTO_CALLOUT is set we have to assume a numerical callout (4
3005
elements) for each character. This is overkill, but memory is plentiful these
3006
days. */
3007
3008
84.5k
if ((options & PCRE2_AUTO_CALLOUT) != 0)
3009
16.8k
  parsed_size_needed += (ptrend - ptr) * 4;
3010
3011
84.5k
return parsed_size_needed;
3012
84.5k
}
3013
3014
3015
3016
/*************************************************
3017
*      Parse regex and identify named groups     *
3018
*************************************************/
3019
3020
/* This function is called first of all. It scans the pattern and does two
3021
things: (1) It identifies capturing groups and makes a table of named capturing
3022
groups so that information about them is fully available to both the compiling
3023
scans. (2) It writes a parsed version of the pattern with comments omitted and
3024
escapes processed into the parsed_pattern vector.
3025
3026
Arguments:
3027
  ptr             points to the start of the pattern
3028
  options         compiling dynamic options (may change during the scan)
  xoptions        extra options bits
3029
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
3030
  cb              pointer to the compile data block
3031
3032
Returns:   zero on success or a non-zero error code, with the
3033
             error offset placed in the cb field
3034
*/
3035
3036
/* A structure and some flags for dealing with nested groups. */
3037
3038
/* One saved record of group-nesting state. Four 16-bit fields are followed by
two 32-bit fields.
NOTE(review): the per-field notes below are inferred from the field names
only; the code that fills and reads these records is in parse_regex() - confirm
there before relying on them. */
typedef struct nest_save {
3039
  uint16_t  nest_depth;   /* presumably the nesting depth the record is for */
3040
  uint16_t  reset_group;  /* presumably a capture group number - TODO confirm */
3041
  uint16_t  max_group;    /* presumably a capture group number - TODO confirm */
3042
  uint16_t  flags;        /* presumably a combination of the NSF_xxx bits */
3043
  uint32_t  options;      /* presumably a saved copy of the options word */
3044
  uint32_t  xoptions;     /* presumably a saved copy of the extra options */
3045
} nest_save;
3046
3047
56.5k
/* The NSF_xxx values are distinct single bits, so they can be OR-ed together
and tested independently. */
#define NSF_RESET          0x0001u
3048
35.0k
#define NSF_CONDASSERT     0x0002u
3049
24.1k
#define NSF_ATOMICSR       0x0004u
3050
3051
/* Options that are changeable within the pattern must be tracked during
3052
parsing. Some (e.g. PCRE2_EXTENDED) are implemented entirely during parsing,
3053
but all must be tracked so that META_OPTIONS items set the correct values for
3054
the main compiling phase. */
3055
3056
56.0k
/* Mask of the PCRE2_xxx bits in the main options word that are tracked while
parsing. */
#define PARSE_TRACKED_OPTIONS (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES| \
3057
56.0k
  PCRE2_EXTENDED|PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
3058
56.0k
  PCRE2_UNGREEDY)
3059
3060
56.0k
/* The corresponding mask of PCRE2_EXTRA_xxx bits, for the extra options
word. */
#define PARSE_TRACKED_EXTRA_OPTIONS (PCRE2_EXTRA_CASELESS_RESTRICT| \
3061
56.0k
  PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
3062
56.0k
  PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX)
3063
3064
/* States used for analyzing ranges in character classes. The two OK values
3065
must be last. */
3066
3067
/* The enum is unnamed and has no explicit values, so the constants are ints
numbered from 0 in the order listed.
NOTE(review): the "must be last" rule suggests the code tests for
state >= RANGE_OK_ESCAPED, and the values are presumably held in
class_range_state in parse_regex() - confirm both there. */
enum {
3068
  RANGE_NO, /* State after '[' (initial), or '[a-z'; hyphen is literal */
3069
  RANGE_STARTED, /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
3070
  RANGE_FORBID_NO, /* State after '[\d'; '-]' is allowed but not '-1]' */
3071
  RANGE_FORBID_STARTED, /* State after '[\d-'*/
3072
  RANGE_OK_ESCAPED, /* State after '[\1'; hyphen may be a range */
3073
  RANGE_OK_LITERAL /* State after '[1'; hyphen may be a range */
3074
};
3075
3076
/* States used for analyzing operators and operands in extended character
3077
classes. */
3078
3079
/* Unnamed enum: the constants are ints numbered from 0 in the order listed.
NOTE(review): presumably the values held in class_op_state in parse_regex() -
confirm there. */
enum {
3080
  CLASS_OP_EMPTY, /* At start of an expression; empty previous contents */
3081
  CLASS_OP_OPERAND, /* Have preceding operand; after "z" a "--" can follow */
3082
  CLASS_OP_OPERATOR /* Have preceding operator; after "--" operand must follow */
3083
};
3084
3085
/* States used for determining the parse mode in character classes. The two
3086
PERL_EXT values must be last. */
3087
3088
/* Unnamed enum: the constants are ints numbered from 0 in the order listed.
NOTE(review): the "must be last" rule suggests the code tests for
state >= CLASS_MODE_PERL_EXT, and the values are presumably held in
class_mode_state in parse_regex() - confirm both there. */
enum {
3089
  CLASS_MODE_NORMAL, /* Ordinary PCRE2 '[...]' class. */
3090
  CLASS_MODE_ALT_EXT, /* UTS#18-style extended '[...]' class. */
3091
  CLASS_MODE_PERL_EXT, /* Perl extended '(?[...])' class. */
3092
  CLASS_MODE_PERL_EXT_LEAF /* Leaf within extended '(?[ [...] ])' class. */
3093
};
3094
3095
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
3096
the storing of literal values in the main parsed pattern, where they can always
3097
be quantified. */
3098
3099
/* Usage notes for PARSED_LITERAL(c, p):
  . Both forms assign TRUE to a variable called okquantifier, so one must be in
    scope wherever the macro is used.
  . The arguments are pasted in without parentheses, p is post-incremented, and
    the 32-bit form expands c twice. Pass plain variables with no side effects.
  . The 32-bit form is a braced block, but the other form is two separate
    statements. It must therefore not be used as the unbraced body of an if,
    else, or loop, because only the store through p would be controlled. */
#if PCRE2_CODE_UNIT_WIDTH == 32
3100
#define PARSED_LITERAL(c, p) \
3101
  { \
3102
  if (c >= META_END) *p++ = META_BIGVALUE; \
3103
  *p++ = c; \
3104
  okquantifier = TRUE; \
3105
  }
3106
#else
3107
7.60M
#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
3108
#endif
3109
3110
/* Here's the actual function. */
3111
3112
static int parse_regex(PCRE2_SPTR ptr, uint32_t options, uint32_t xoptions,
3113
  BOOL *has_lookbehind, compile_block *cb)
3114
84.5k
{
3115
84.5k
uint32_t c;
3116
84.5k
uint32_t delimiter;
3117
84.5k
uint32_t namelen;
3118
84.5k
uint32_t class_range_state;
3119
84.5k
uint32_t class_op_state;
3120
84.5k
uint32_t class_mode_state;
3121
84.5k
uint32_t *class_start;
3122
84.5k
uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
3123
84.5k
uint32_t *verbstartptr = NULL;
3124
84.5k
uint32_t *previous_callout = NULL;
3125
84.5k
uint32_t *parsed_pattern = cb->parsed_pattern;
3126
84.5k
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
3127
84.5k
uint32_t *this_parsed_item = NULL;
3128
84.5k
uint32_t *prev_parsed_item = NULL;
3129
84.5k
uint32_t meta_quantifier = 0;
3130
84.5k
uint32_t add_after_mark = 0;
3131
84.5k
uint16_t nest_depth = 0;
3132
84.5k
int16_t class_depth_m1 = -1; /* The m1 means minus 1. */
3133
84.5k
int16_t class_maxdepth_m1 = -1;
3134
84.5k
uint16_t hash;
3135
84.5k
int after_manual_callout = 0;
3136
84.5k
int expect_cond_assert = 0;
3137
84.5k
int errorcode = 0;
3138
84.5k
int escape;
3139
84.5k
int i;
3140
84.5k
BOOL inescq = FALSE;
3141
84.5k
BOOL inverbname = FALSE;
3142
84.5k
BOOL utf = (options & PCRE2_UTF) != 0;
3143
84.5k
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
3144
84.5k
BOOL is_dupname;
3145
84.5k
BOOL negate_class;
3146
84.5k
BOOL okquantifier = FALSE;
3147
84.5k
PCRE2_SPTR thisptr;
3148
84.5k
PCRE2_SPTR name;
3149
84.5k
PCRE2_SPTR ptrend = cb->end_pattern;
3150
84.5k
PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
3151
84.5k
PCRE2_SPTR class_range_forbid_ptr = NULL;
3152
84.5k
named_group *ng;
3153
84.5k
nest_save *top_nest, *end_nests;
3154
#ifdef PCRE2_DEBUG
3155
uint32_t *parsed_pattern_check;
3156
ptrdiff_t parsed_pattern_extra = 0;
3157
ptrdiff_t parsed_pattern_extra_check = 0;
3158
PCRE2_SPTR ptr_check;
3159
#endif
3160
3161
84.5k
PCRE2_ASSERT(parsed_pattern != NULL);
3162
3163
/* Insert leading items for word and line matching (features provided for the
3164
benefit of pcre2grep). */
3165
3166
84.5k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
3167
0
  {
3168
0
  *parsed_pattern++ = META_CIRCUMFLEX;
3169
0
  *parsed_pattern++ = META_NOCAPTURE;
3170
0
  }
3171
84.5k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
3172
0
  {
3173
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
3174
0
  *parsed_pattern++ = META_NOCAPTURE;
3175
0
  }
3176
3177
#ifdef PCRE2_DEBUG
3178
parsed_pattern_check = parsed_pattern;
3179
ptr_check = ptr;
3180
#endif
3181
3182
/* If the pattern is actually a literal string, process it separately to avoid
3183
cluttering up the main loop. */
3184
3185
84.5k
if ((options & PCRE2_LITERAL) != 0)
3186
0
  {
3187
0
  while (ptr < ptrend)
3188
0
    {
3189
    /* LCOV_EXCL_START */
3190
0
    if (parsed_pattern >= parsed_pattern_end)
3191
0
      {
3192
0
      PCRE2_DEBUG_UNREACHABLE();
3193
0
      errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3194
0
      goto FAILED;
3195
0
      }
3196
    /* LCOV_EXCL_STOP */
3197
3198
0
    thisptr = ptr;
3199
0
    GETCHARINCTEST(c, ptr);
3200
0
    if (auto_callout)
3201
0
      parsed_pattern = manage_callouts(thisptr, &previous_callout,
3202
0
        auto_callout, parsed_pattern, cb);
3203
0
    PARSED_LITERAL(c, parsed_pattern);
3204
0
    }
3205
0
  goto PARSED_END;
3206
0
  }
3207
3208
/* Process a real regex which may contain meta-characters. */
3209
3210
84.5k
top_nest = NULL;
3211
84.5k
end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
3212
3213
/* The size of the nest_save structure might not be a factor of the size of the
3214
workspace. Therefore we must round down end_nests so as to correctly avoid
3215
creating a nest_save that spans the end of the workspace. */
3216
3217
84.5k
end_nests = (nest_save *)((char *)end_nests -
3218
84.5k
  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
3219
3220
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
3221
3222
84.5k
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
3223
3224
/* Now scan the pattern */
3225
3226
8.88M
while (ptr < ptrend)
3227
8.80M
  {
3228
8.80M
  int prev_expect_cond_assert;
3229
8.80M
  uint32_t min_repeat = 0, max_repeat = 0;
3230
8.80M
  uint32_t set, unset, *optset;
3231
8.80M
  uint32_t xset, xunset, *xoptset;
3232
8.80M
  uint32_t terminator;
3233
8.80M
  uint32_t prev_meta_quantifier;
3234
8.80M
  BOOL prev_okquantifier;
3235
8.80M
  PCRE2_SPTR tempptr;
3236
8.80M
  PCRE2_SIZE offset;
3237
3238
8.80M
  if (nest_depth > cb->cx->parens_nest_limit)
3239
3
    {
3240
3
    errorcode = ERR19;
3241
3
    goto FAILED;        /* Parentheses too deeply nested */
3242
3
    }
3243
3244
  /* Check that we haven't emitted too much into parsed_pattern. We allocate
3245
  a suitably-sized buffer upfront, then do unchecked writes to it. If we only
3246
  write a little bit too much, everything will appear to be OK, because the
3247
  upfront size is an overestimate... but a malicious pattern could end up
3248
  forcing a write past the buffer end. We must catch this during
3249
  development. */
3250
3251
#ifdef PCRE2_DEBUG
3252
  /* Strong post-write check. Won't help in release builds - at this point
3253
  the write has already occurred so it's too late. However, should stop us
3254
  committing unsafe code. */
3255
  PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
3256
               (parsed_pattern_extra - parsed_pattern_extra_check) <=
3257
                 max_parsed_pattern(ptr_check, ptr, utf, options));
3258
  parsed_pattern_check = parsed_pattern;
3259
  parsed_pattern_extra_check = parsed_pattern_extra;
3260
  ptr_check = ptr;
3261
#endif
3262
3263
  /* LCOV_EXCL_START */
3264
8.80M
  if (parsed_pattern >= parsed_pattern_end)
3265
0
    {
3266
    /* Weak pre-write check; only ensures parsed_pattern[0] is writeable
3267
    (but the code below can write many chars). Better than nothing. */
3268
0
    PCRE2_DEBUG_UNREACHABLE();
3269
0
    errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3270
0
    goto FAILED;
3271
0
    }
3272
  /* LCOV_EXCL_STOP */
3273
3274
  /* If the last time round this loop something was added, parsed_pattern will
3275
  no longer be equal to this_parsed_item. Remember where the previous item
3276
  started and reset for the next item. Note that sometimes round the loop,
3277
  nothing gets added (e.g. for ignored white space). */
3278
3279
8.80M
  if (this_parsed_item != parsed_pattern)
3280
8.66M
    {
3281
8.66M
    prev_parsed_item = this_parsed_item;
3282
8.66M
    this_parsed_item = parsed_pattern;
3283
8.66M
    }
3284
3285
  /* Get next input character, save its position for callout handling. */
3286
3287
8.80M
  thisptr = ptr;
3288
8.80M
  GETCHARINCTEST(c, ptr);
3289
3290
  /* Copy quoted literals until \E, allowing for the possibility of automatic
3291
  callouts, except when processing a (*VERB) "name".  */
3292
3293
8.80M
  if (inescq)
3294
130k
    {
3295
130k
    if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3296
1.36k
      {
3297
1.36k
      inescq = FALSE;
3298
1.36k
      ptr++;   /* Skip E */
3299
1.36k
      }
3300
128k
    else
3301
128k
      {
3302
128k
      if (inverbname)
3303
444
        {                          /* Don't use PARSED_LITERAL() because it */
3304
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3305
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3306
#endif
3307
444
        *parsed_pattern++ = c;
3308
444
        }
3309
128k
      else
3310
128k
        {
3311
128k
        if (after_manual_callout-- <= 0)
3312
127k
          parsed_pattern = manage_callouts(thisptr, &previous_callout,
3313
127k
            auto_callout, parsed_pattern, cb);
3314
128k
        PARSED_LITERAL(c, parsed_pattern);
3315
128k
        }
3316
128k
      meta_quantifier = 0;
3317
128k
      }
3318
130k
    continue;  /* Next character */
3319
130k
    }
3320
3321
  /* If we are processing the "name" part of a (*VERB:NAME) item, all
3322
  characters up to the closing parenthesis are literals except when
3323
  PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
3324
  and \E and escaped characters are allowed (no character types such as \d). If
3325
  PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
3326
  this by not entering the special (*VERB:NAME) processing - they are then
3327
  picked up below. Note that c is a character, not a code unit, so we must not
3328
  use MAX_255 to test its size because MAX_255 tests code units and is assumed
3329
  TRUE in 8-bit mode. */
3330
3331
8.67M
  if (inverbname &&
3332
68.0k
       (
3333
        /* EITHER: not both options set */
3334
68.0k
        ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
3335
68.0k
                    (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
3336
7.53k
#ifdef SUPPORT_UNICODE
3337
        /* OR: character > 255 AND not Unicode Pattern White Space */
3338
7.53k
        (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
3339
7.12k
#endif
3340
        /* OR: not a # comment or isspace() white space */
3341
7.12k
        (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
3342
6.55k
#ifdef SUPPORT_UNICODE
3343
        /* and not CHAR_NEL when Unicode is supported */
3344
6.55k
          && c != CHAR_NEL
3345
7.12k
#endif
3346
7.12k
       )))
3347
67.2k
    {
3348
67.2k
    PCRE2_SIZE verbnamelength;
3349
3350
67.2k
    switch(c)
3351
67.2k
      {
3352
56.5k
      default:                     /* Don't use PARSED_LITERAL() because it */
3353
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3354
      if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3355
#endif
3356
56.5k
      *parsed_pattern++ = c;
3357
56.5k
      break;
3358
3359
6.68k
      case CHAR_RIGHT_PARENTHESIS:
3360
6.68k
      inverbname = FALSE;
3361
      /* This is the length in characters */
3362
6.68k
      verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
3363
      /* But the limit on the length is in code units */
3364
6.68k
      if (ptr - verbnamestart - 1 > (int)MAX_MARK)
3365
7
        {
3366
7
        ptr--;
3367
7
        errorcode = ERR76;
3368
7
        goto FAILED;
3369
7
        }
3370
6.67k
      *verblengthptr = (uint32_t)verbnamelength;
3371
3372
      /* If this name was on a verb such as (*ACCEPT) which does not continue,
3373
      a (*MARK) was generated for the name. We now add the original verb as the
3374
      next item. */
3375
3376
6.67k
      if (add_after_mark != 0)
3377
505
        {
3378
505
        *parsed_pattern++ = add_after_mark;
3379
505
        add_after_mark = 0;
3380
505
        }
3381
6.67k
      break;
3382
3383
4.05k
      case CHAR_BACKSLASH:
3384
4.05k
      if ((options & PCRE2_ALT_VERBNAMES) != 0)
3385
768
        {
3386
768
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3387
768
          xoptions, cb->bracount, FALSE, cb);
3388
768
        if (errorcode != 0) goto FAILED;
3389
768
        }
3390
3.28k
      else escape = 0;   /* Treat all as literal */
3391
3392
4.04k
      switch(escape)
3393
4.04k
        {
3394
3.61k
        case 0:                    /* Don't use PARSED_LITERAL() because it */
3395
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3396
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3397
#endif
3398
3.61k
        *parsed_pattern++ = c;
3399
3.61k
        break;
3400
3401
0
        case ESC_ub:
3402
0
        *parsed_pattern++ = CHAR_u;
3403
0
        PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3404
0
        break;
3405
3406
111
        case ESC_Q:
3407
111
        inescq = TRUE;
3408
111
        break;
3409
3410
310
        case ESC_E:           /* Ignore */
3411
310
        break;
3412
3413
10
        default:
3414
10
        errorcode = ERR40;    /* Invalid in verb name */
3415
10
        goto FAILED;
3416
4.04k
        }
3417
67.2k
      }
3418
67.2k
    continue;   /* Next character in pattern */
3419
67.2k
    }
3420
3421
  /* Not a verb name character. At this point we must process everything that
3422
  must not change the quantification state. This is mainly comments, but we
3423
  handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3424
  A+, as in Perl. An isolated \E is ignored. */
3425
3426
8.60M
  if (c == CHAR_BACKSLASH && ptr < ptrend)
3427
394k
    {
3428
394k
    if (*ptr == CHAR_Q || *ptr == CHAR_E)
3429
3.11k
      {
3430
      /* A literal inside a \Q...\E is not allowed if we are expecting a
3431
      conditional assertion, but an empty \Q\E sequence is OK. */
3432
3.11k
      if (expect_cond_assert > 0 && *ptr == CHAR_Q &&
3433
50
          !(ptrend - ptr >= 3 && ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E))
3434
12
        {
3435
12
        ptr--;
3436
12
        errorcode = ERR28;
3437
12
        goto FAILED;
3438
12
        }
3439
3.10k
      inescq = *ptr == CHAR_Q;
3440
3.10k
      ptr++;
3441
3.10k
      continue;
3442
3.11k
      }
3443
394k
    }
3444
3445
  /* Skip over whitespace and # comments in extended mode. Note that c is a
3446
  character, not a code unit, so we must not use MAX_255 to test its size
3447
  because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3448
  whitespace characters are those designated as "Pattern White Space" by
3449
  Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3450
  U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3451
  subset of space characters that match \h and \v. */
3452
3453
8.60M
  if ((options & PCRE2_EXTENDED) != 0)
3454
1.94M
    {
3455
1.94M
    if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3456
1.89M
#ifdef SUPPORT_UNICODE
3457
1.89M
    if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3458
1.89M
#endif
3459
1.89M
    if (c == CHAR_NUMBER_SIGN)
3460
1.72k
      {
3461
110k
      while (ptr < ptrend)
3462
110k
        {
3463
110k
        if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3464
1.18k
          {                       /* IS_NEWLINE sets cb->nllen. */
3465
1.18k
          ptr += cb->nllen;
3466
1.18k
          break;
3467
1.18k
          }
3468
109k
        ptr++;
3469
109k
#ifdef SUPPORT_UNICODE
3470
109k
        if (utf) FORWARDCHARTEST(ptr, ptrend);
3471
109k
#endif
3472
109k
        }
3473
1.72k
      continue;  /* Next character in pattern */
3474
1.72k
      }
3475
1.89M
    }
3476
3477
  /* Skip over bracketed comments */
3478
3479
8.54M
  if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3480
1.07M
      ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3481
685
    {
3482
3.41k
    while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3483
685
    if (ptr >= ptrend)
3484
15
      {
3485
15
      errorcode = ERR18;  /* A special error for missing ) in a comment */
3486
15
      goto FAILED;        /* to make it easier to debug. */
3487
15
      }
3488
670
    ptr++;
3489
670
    continue;  /* Next character in pattern */
3490
685
    }
3491
3492
  /* If the next item is not a quantifier, fill in length of any previous
3493
  callout and create an auto callout if required. */
3494
3495
8.54M
  if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3496
8.04M
       (c != CHAR_LEFT_CURLY_BRACKET ||
3497
171k
         (tempptr = ptr,
3498
171k
         !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3499
7.96M
    {
3500
7.96M
    if (after_manual_callout-- <= 0)
3501
7.95M
      {
3502
7.95M
      parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3503
7.95M
        parsed_pattern, cb);
3504
7.95M
      this_parsed_item = parsed_pattern;  /* New start for current item */
3505
7.95M
      }
3506
7.96M
    }
3507
3508
  /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3509
  assertion, possibly preceded by a callout. If the value is 1, we have just
3510
  had the callout and expect an assertion. There must be at least 3 more
3511
  characters in all cases. When expect_cond_assert is 2, we know that the
3512
  current character is an opening parenthesis, as otherwise we wouldn't be
3513
  here. However, when it is 1, we need to check, and it's easiest just to check
3514
  always. Note that expect_cond_assert may be negative, since all callouts just
3515
  decrement it. */
3516
3517
8.54M
  if (expect_cond_assert > 0)
3518
13.1k
    {
3519
13.1k
    BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3520
13.1k
              (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3521
13.1k
    if (ok)
3522
13.1k
      {
3523
13.1k
      if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3524
79
        {
3525
79
        ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3526
79
        }
3527
13.0k
      else switch(ptr[1])  /* Traditional symbolic format */
3528
13.0k
        {
3529
1.81k
        case CHAR_C:
3530
1.81k
        ok = expect_cond_assert == 2;
3531
1.81k
        break;
3532
3533
6.66k
        case CHAR_EQUALS_SIGN:
3534
8.51k
        case CHAR_EXCLAMATION_MARK:
3535
8.51k
        break;
3536
3537
2.69k
        case CHAR_LESS_THAN_SIGN:
3538
2.69k
        ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3539
2.69k
        break;
3540
3541
14
        default:
3542
14
        ok = FALSE;
3543
13.0k
        }
3544
13.1k
      }
3545
3546
13.1k
    if (!ok)
3547
76
      {
3548
76
      errorcode = ERR28;
3549
76
      if (expect_cond_assert == 2) goto FAILED;
3550
33
      goto FAILED_BACK;
3551
76
      }
3552
13.1k
    }
3553
3554
  /* Remember whether we are expecting a conditional assertion, and set the
3555
  default for this item. */
3556
3557
8.54M
  prev_expect_cond_assert = expect_cond_assert;
3558
8.54M
  expect_cond_assert = 0;
3559
3560
  /* Remember quantification status for the previous significant item, then set
3561
  default for this item. */
3562
3563
8.54M
  prev_okquantifier = okquantifier;
3564
8.54M
  prev_meta_quantifier = meta_quantifier;
3565
8.54M
  okquantifier = FALSE;
3566
8.54M
  meta_quantifier = 0;
3567
3568
  /* If the previous significant item was a quantifier, adjust the parsed code
3569
  if there is a following modifier. The base meta value is always followed by
3570
  the PLUS and QUERY values, in that order. We do this here rather than after
3571
  reading a quantifier so that intervening comments and /x whitespace can be
3572
  ignored without having to replicate code. */
3573
3574
8.54M
  if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3575
69.2k
    {
3576
69.2k
    parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3577
69.2k
      prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3578
60.0k
        0x00020000u : 0x00010000u);
3579
69.2k
    continue;  /* Next character in pattern */
3580
69.2k
    }
3581
3582
  /* Process the next item in the main part of a pattern. */
3583
3584
8.47M
  switch(c)
3585
8.47M
    {
3586
4.61M
    default:              /* Non-special character */
3587
4.61M
    PARSED_LITERAL(c, parsed_pattern);
3588
4.61M
    break;
3589
3590
3591
    /* ---- Escape sequence ---- */
3592
3593
390k
    case CHAR_BACKSLASH:
3594
390k
    tempptr = ptr;
3595
390k
    escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3596
390k
      xoptions, cb->bracount, FALSE, cb);
3597
390k
    if (errorcode != 0)
3598
822
      {
3599
1.21k
      ESCAPE_FAILED:
3600
1.21k
      if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3601
1.21k
        goto FAILED;
3602
0
      ptr = tempptr;
3603
0
      if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3604
0
        {
3605
0
        GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3606
0
        }
3607
0
      escape = 0;                 /* Treat as literal character */
3608
0
      }
3609
3610
    /* The escape was a data escape or literal character. */
3611
3612
390k
    if (escape == 0)
3613
129k
      {
3614
129k
      PARSED_LITERAL(c, parsed_pattern);
3615
129k
      }
3616
3617
    /* The escape was a back (or forward) reference. We keep the offset in
3618
    order to give a more useful diagnostic for a bad forward reference. For
3619
    references to groups numbered less than 10 we can't use more than two items
3620
    in parsed_pattern because they may be just two characters in the input (and
3621
    in a 64-bit world an offset may need two elements). So for them, the offset
3622
    of the first occurrence is held in a special vector. */
3623
3624
260k
    else if (escape < 0)
3625
15.7k
      {
3626
15.7k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
3627
15.7k
      escape = -escape - 1;
3628
15.7k
      *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3629
15.7k
      if (escape < 10)
3630
12.7k
        {
3631
12.7k
        if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3632
6.72k
          cb->small_ref_offset[escape] = offset;
3633
12.7k
        }
3634
2.97k
      else
3635
2.97k
        {
3636
2.97k
        PUTOFFSET(offset, parsed_pattern);
3637
2.97k
        }
3638
15.7k
      okquantifier = TRUE;
3639
15.7k
      }
3640
3641
    /* The escape was a character class such as \d etc. or other special
3642
    escape indicator such as \A or \X. Most of them generate just a single
3643
    parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3644
    value. They are supported only when Unicode is available. The type and
3645
    value are packed into a single 32-bit value so that the whole sequence
3646
    uses only two elements in the parsed_vector. This is because the same
3647
    coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3648
    set.
3649
3650
    There are also some cases where the escape sequence is followed by a name:
3651
    \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3652
    and \g'name' are subroutine calls by name; \g{name} is a synonym for
3653
    \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3654
    and returned as a negative value (handled above). A name is coded as an
3655
    offset into the pattern and a length. */
3656
3657
244k
    else switch (escape)
3658
244k
      {
3659
5
      case ESC_C:
3660
5
#ifdef NEVER_BACKSLASH_C
3661
5
      errorcode = ERR85;
3662
5
      goto ESCAPE_FAILED;
3663
#else
3664
      if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3665
        {
3666
        errorcode = ERR83;
3667
        goto ESCAPE_FAILED;
3668
        }
3669
#endif
3670
0
      okquantifier = TRUE;
3671
0
      *parsed_pattern++ = META_ESCAPE + escape;
3672
0
      break;
3673
3674
      /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3675
      when \u{ is not followed by hex digits and }. It requests two literal
3676
      characters, u and { and we need this, as otherwise \u{ 12} (for example)
3677
      would be treated as u{12} now that spaces are allowed in quantifiers. */
3678
3679
0
      case ESC_ub:
3680
0
      *parsed_pattern++ = CHAR_u;
3681
0
      PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3682
0
      break;
3683
3684
5.99k
      case ESC_X:
3685
#ifndef SUPPORT_UNICODE
3686
      errorcode = ERR45;   /* Supported only with Unicode support */
3687
      goto ESCAPE_FAILED;
3688
#endif
3689
19.2k
      case ESC_H:
3690
36.8k
      case ESC_h:
3691
40.4k
      case ESC_N:
3692
49.3k
      case ESC_R:
3693
55.0k
      case ESC_V:
3694
58.9k
      case ESC_v:
3695
58.9k
      okquantifier = TRUE;
3696
58.9k
      *parsed_pattern++ = META_ESCAPE + escape;
3697
58.9k
      break;
3698
3699
29.6k
      default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3700
29.6k
      *parsed_pattern++ = META_ESCAPE + escape;
3701
29.6k
      break;
3702
3703
      /* Escapes that may change in UCP mode. */
3704
3705
17.0k
      case ESC_d:
3706
27.8k
      case ESC_D:
3707
48.8k
      case ESC_s:
3708
82.5k
      case ESC_S:
3709
125k
      case ESC_w:
3710
137k
      case ESC_W:
3711
137k
      okquantifier = TRUE;
3712
137k
      parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3713
137k
        xoptions);
3714
137k
      break;
3715
3716
      /* Unicode property matching */
3717
3718
6.93k
      case ESC_P:
3719
11.7k
      case ESC_p:
3720
11.7k
#ifdef SUPPORT_UNICODE
3721
11.7k
        {
3722
11.7k
        BOOL negated;
3723
11.7k
        uint16_t ptype = 0, pdata = 0;
3724
11.7k
        if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))
3725
285
          goto ESCAPE_FAILED;
3726
11.4k
        if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3727
11.4k
        *parsed_pattern++ = META_ESCAPE + escape;
3728
11.4k
        *parsed_pattern++ = (ptype << 16) | pdata;
3729
11.4k
        okquantifier = TRUE;
3730
11.4k
        }
3731
#else
3732
      errorcode = ERR45;
3733
      goto ESCAPE_FAILED;
3734
#endif
3735
0
      break;  /* End \P and \p */
3736
3737
      /* When \g is used with quotes or angle brackets as delimiters, it is a
3738
      numerical or named subroutine call, and control comes here. When used
3739
      with brace delimiters it is a numerical back reference and does not come
3740
      here because check_escape() returns it directly as a reference. \k is
3741
      always a named back reference. */
3742
3743
752
      case ESC_g:
3744
7.04k
      case ESC_k:
3745
7.04k
      if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3746
6.87k
          *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3747
6
        {
3748
6
        errorcode = (escape == ESC_g)? ERR57 : ERR69;
3749
6
        goto ESCAPE_FAILED;
3750
6
        }
3751
7.04k
      terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3752
6.40k
        CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3753
6.23k
        CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3754
3755
      /* For a non-braced \g, check for a numerical recursion. */
3756
3757
7.04k
      if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3758
752
        {
3759
752
        PCRE2_SPTR p = ptr + 1;
3760
3761
752
        if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3762
752
            &errorcode))
3763
368
          {
3764
368
          if (p >= ptrend || *p != terminator)
3765
6
            {
3766
6
            ptr = p;
3767
6
            errorcode = ERR119;  /* Missing terminator for number */
3768
6
            goto ESCAPE_FAILED;
3769
6
            }
3770
362
          ptr = p + 1;
3771
362
          goto SET_RECURSION;
3772
368
          }
3773
384
        if (errorcode != 0) goto ESCAPE_FAILED;
3774
384
        }
3775
3776
      /* Not a numerical recursion. Perl allows spaces and tabs after { and
3777
      before } but not for other delimiters. */
3778
3779
6.66k
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3780
6.66k
          &errorcode, cb)) goto ESCAPE_FAILED;
3781
3782
      /* \k and \g when used with braces are back references, whereas \g used
3783
      with quotes or angle brackets is a recursion */
3784
3785
6.57k
      *parsed_pattern++ =
3786
6.57k
        (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3787
6.21k
          META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3788
6.57k
      *parsed_pattern++ = namelen;
3789
3790
6.57k
      PUTOFFSET(offset, parsed_pattern);
3791
6.57k
      okquantifier = TRUE;
3792
6.57k
      break;  /* End special escape processing */
3793
244k
      }
3794
389k
    break;    /* End escape sequence processing */
3795
3796
3797
    /* ---- Single-character special items ---- */
3798
3799
389k
    case CHAR_CIRCUMFLEX_ACCENT:
3800
56.1k
    *parsed_pattern++ = META_CIRCUMFLEX;
3801
56.1k
    break;
3802
3803
62.7k
    case CHAR_DOLLAR_SIGN:
3804
62.7k
    *parsed_pattern++ = META_DOLLAR;
3805
62.7k
    break;
3806
3807
96.5k
    case CHAR_DOT:
3808
96.5k
    *parsed_pattern++ = META_DOT;
3809
96.5k
    okquantifier = TRUE;
3810
96.5k
    break;
3811
3812
3813
    /* ---- Single-character quantifiers ---- */
3814
3815
148k
    case CHAR_ASTERISK:
3816
148k
    meta_quantifier = META_ASTERISK;
3817
148k
    goto CHECK_QUANTIFIER;
3818
3819
180k
    case CHAR_PLUS:
3820
180k
    meta_quantifier = META_PLUS;
3821
180k
    goto CHECK_QUANTIFIER;
3822
3823
105k
    case CHAR_QUESTION_MARK:
3824
105k
    meta_quantifier = META_QUERY;
3825
105k
    goto CHECK_QUANTIFIER;
3826
3827
3828
    /* ---- Potential {n,m} quantifier ---- */
3829
3830
171k
    case CHAR_LEFT_CURLY_BRACKET:
3831
171k
    if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3832
171k
        &errorcode))
3833
87.7k
      {
3834
87.7k
      if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3835
87.7k
      PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3836
87.7k
      break;                               /* No more quantifier processing */
3837
87.7k
      }
3838
83.5k
    meta_quantifier = META_MINMAX;
3839
    /* Fall through */
3840
3841
3842
    /* ---- Quantifier post-processing ---- */
3843
3844
    /* Check that a quantifier is allowed after the previous item. This
3845
    guarantees that there is a previous item. */
3846
3847
517k
    CHECK_QUANTIFIER:
3848
517k
    if (!prev_okquantifier)
3849
208
      {
3850
208
      errorcode = ERR9;
3851
208
      goto FAILED;
3852
208
      }
3853
3854
    /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3855
    quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3856
    sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3857
    wrapping it in non-capturing brackets, but we have to allow for a preceding
3858
    (*MARK) for when (*ACCEPT) has an argument. */
3859
3860
517k
    if (*prev_parsed_item == META_ACCEPT)
3861
671
      {
3862
671
      uint32_t *p;
3863
8.72k
      for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3864
671
      *verbstartptr = META_NOCAPTURE;
3865
671
      parsed_pattern[1] = META_KET;
3866
671
      parsed_pattern += 2;
3867
3868
#ifdef PCRE2_DEBUG
3869
      PCRE2_ASSERT(parsed_pattern_extra >= 2);
3870
      parsed_pattern_extra -= 2;
3871
#endif
3872
671
      }
3873
3874
    /* Now we can put the quantifier into the parsed pattern vector. At this
3875
    stage, we have only the basic quantifier. The check for a following + or ?
3876
    modifier happens at the top of the loop, after any intervening comments
3877
    have been removed. */
3878
3879
517k
    *parsed_pattern++ = meta_quantifier;
3880
517k
    if (c == CHAR_LEFT_CURLY_BRACKET)
3881
83.4k
      {
3882
83.4k
      *parsed_pattern++ = min_repeat;
3883
83.4k
      *parsed_pattern++ = max_repeat;
3884
83.4k
      }
3885
517k
    break;
3886
3887
3888
    /* ---- Character class ---- */
3889
3890
170k
    case CHAR_LEFT_SQUARE_BRACKET:
3891
3892
    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3893
    used for "start of word" and "end of word". As these are otherwise illegal
3894
    sequences, we don't break anything by recognizing them. They are replaced
3895
    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3896
    erroneous and are handled by the normal code below. */
3897
3898
170k
    if (ptrend - ptr >= 6 &&
3899
167k
         (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3900
167k
          PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3901
1.15k
      {
3902
1.15k
      *parsed_pattern++ = META_ESCAPE + ESC_b;
3903
3904
1.15k
      if (ptr[2] == CHAR_LESS_THAN_SIGN)
3905
206
        {
3906
206
        *parsed_pattern++ = META_LOOKAHEAD;
3907
206
        }
3908
944
      else
3909
944
        {
3910
944
        *parsed_pattern++ = META_LOOKBEHIND;
3911
944
        *has_lookbehind = TRUE;
3912
3913
        /* The offset is used only for the "non-fixed length" error; this won't
3914
        occur here, so just store zero. */
3915
3916
944
        PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3917
944
        }
3918
3919
1.15k
      if ((options & PCRE2_UCP) == 0)
3920
632
        *parsed_pattern++ = META_ESCAPE + ESC_w;
3921
518
      else
3922
518
        {
3923
518
        *parsed_pattern++ = META_ESCAPE + ESC_p;
3924
518
        *parsed_pattern++ = PT_WORD << 16;
3925
518
        }
3926
1.15k
      *parsed_pattern++ = META_KET;
3927
1.15k
      ptr += 6;
3928
1.15k
      okquantifier = TRUE;
3929
1.15k
      break;
3930
1.15k
      }
3931
3932
    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3933
    they are encountered at the top level, so we'll do that too. */
3934
3935
169k
    if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3936
164k
         *ptr == CHAR_EQUALS_SIGN) &&
3937
5.26k
        check_posix_syntax(ptr, ptrend, &tempptr))
3938
113
      {
3939
113
      errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3940
113
      ptr = tempptr + 2;
3941
113
      goto FAILED;
3942
113
      }
3943
3944
169k
    class_mode_state = ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)?
3945
145k
        CLASS_MODE_ALT_EXT : CLASS_MODE_NORMAL;
3946
3947
    /* Jump here from '(?[...])'. That jump must initialize class_mode_state,
3948
    set c to the '[' character, and ptr to just after the '['. */
3949
3950
169k
    FROM_PERL_EXTENDED_CLASS:
3951
169k
    okquantifier = TRUE;
3952
3953
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3954
    because there are holes in the encoding, and simply using the range A-Z
3955
    (for example) would include the characters in the holes. This applies only
3956
    to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3957
    in this respect. In order to accommodate this, we keep track of whether
3958
    character values are literal or not, and a state variable for handling
3959
    ranges. */
3960
3961
    /* Loop for the contents of the class. Classes may be nested, if
3962
    PCRE2_ALT_EXTENDED_CLASS is set, or the class is of the form (?[...]). */
3963
3964
    /* c is still set to '[' so the loop will handle the start of the class. */
3965
3966
169k
    class_depth_m1 = -1;
3967
169k
    class_maxdepth_m1 = -1;
3968
169k
    class_range_state = RANGE_NO;
3969
169k
    class_op_state = CLASS_OP_EMPTY;
3970
169k
    class_start = NULL;
3971
3972
169k
    for (;;)
3973
3.18M
      {
3974
3.18M
      BOOL char_is_literal = TRUE;
3975
3976
      /* Inside \Q...\E everything is literal except \E */
3977
3978
3.18M
      if (inescq)
3979
5.28k
        {
3980
5.28k
        if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3981
558
          {
3982
558
          inescq = FALSE;                   /* Reset literal state */
3983
558
          ptr++;                            /* Skip the 'E' */
3984
558
          goto CLASS_CONTINUE;
3985
558
          }
3986
3987
        /* Surprisingly, you cannot use \Q..\E to escape a character inside a
3988
        Perl extended class. However, empty \Q\E sequences are allowed, so here
3989
        we're only giving an error if the \Q..\E is non-empty. */
3990
3991
4.72k
        if (class_mode_state == CLASS_MODE_PERL_EXT)
3992
3
          {
3993
3
          errorcode = ERR116;
3994
3
          goto FAILED;
3995
3
          }
3996
3997
4.72k
        goto CLASS_LITERAL;
3998
4.72k
        }
3999
4000
      /* Skip over space and tab (only) in extended-more mode, or anywhere
4001
      inside a Perl extended class (which implies /xx). */
4002
4003
3.17M
      if ((c == CHAR_SPACE || c == CHAR_HT) &&
4004
16.3k
          ((options & PCRE2_EXTENDED_MORE) != 0 ||
4005
13.8k
           class_mode_state >= CLASS_MODE_PERL_EXT))
4006
3.92k
        goto CLASS_CONTINUE;
4007
4008
      /* Handle POSIX class names. Perl allows a negation extension of the
4009
      form [:^name:]. A square bracket that doesn't match the syntax is
4010
      treated as a literal. We also recognize the POSIX constructions
4011
      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4012
      5.6 and 5.8 do. */
4013
4014
3.17M
      if (class_depth_m1 >= 0 &&
4015
3.00M
          c == CHAR_LEFT_SQUARE_BRACKET &&
4016
166k
          ptrend - ptr >= 3 &&
4017
166k
          (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
4018
132k
           *ptr == CHAR_EQUALS_SIGN) &&
4019
33.7k
          check_posix_syntax(ptr, ptrend, &tempptr))
4020
20.7k
        {
4021
20.7k
        BOOL posix_negate = FALSE;
4022
20.7k
        int posix_class;
4023
4024
        /* Perl treats a hyphen before a POSIX class as a literal, not the
4025
        start of a range. However, it gives a warning in its warning mode. PCRE
4026
        does not have a warning mode, so we give an error, because this is
4027
        likely an error on the user's part. */
4028
4029
20.7k
        if (class_range_state == RANGE_STARTED)
4030
5
          {
4031
5
          ptr = tempptr + 2;
4032
5
          errorcode = ERR50;
4033
5
          goto FAILED;
4034
5
          }
4035
4036
        /* Perl treats a hyphen after a POSIX class as a literal, not the
4037
        start of a range. However, it gives a warning in its warning mode
4038
        unless the hyphen is the last character in the class. PCRE does not
4039
        have a warning mode, so we give an error, because this is likely an
4040
        error on the user's part.
4041
4042
        Roll back to the hyphen for the error position. */
4043
4044
20.7k
        if (class_range_state == RANGE_FORBID_STARTED)
4045
3
          {
4046
3
          ptr = class_range_forbid_ptr;
4047
3
          errorcode = ERR50;
4048
3
          goto FAILED;
4049
3
          }
4050
4051
        /* Disallow implicit union in Perl extended classes. */
4052
4053
20.7k
        if (class_op_state == CLASS_OP_OPERAND &&
4054
17.3k
            class_mode_state == CLASS_MODE_PERL_EXT)
4055
3
          {
4056
3
          ptr = tempptr + 2;
4057
3
          errorcode = ERR113;
4058
3
          goto FAILED;
4059
3
          }
4060
4061
20.7k
        if (*ptr != CHAR_COLON)
4062
3
          {
4063
3
          ptr = tempptr + 2;
4064
3
          errorcode = ERR13;
4065
3
          goto FAILED;
4066
3
          }
4067
4068
20.7k
        if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
4069
1.10k
          {
4070
1.10k
          posix_negate = TRUE;
4071
1.10k
          ptr++;
4072
1.10k
          }
4073
4074
20.7k
        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4075
20.7k
        ptr = tempptr + 2;
4076
20.7k
        if (posix_class < 0)
4077
35
          {
4078
35
          errorcode = ERR30;
4079
35
          goto FAILED;
4080
35
          }
4081
4082
        /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
4083
        case, the hyphen is treated as a literal, but for '-1' it is disallowed
4084
        (because it would be interpreted as a range). */
4085
4086
20.7k
        class_range_state = RANGE_FORBID_NO;
4087
20.7k
        class_op_state = CLASS_OP_OPERAND;
4088
4089
        /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
4090
        of the POSIX classes are converted to use Unicode properties \p or \P
4091
        or, in one case, \h or \H. The substitutes table has two values per
4092
        class, containing the type and value of a \p or \P item. The special
4093
        cases are specified with a negative type: a non-zero value causes \h or
4094
        \H to be used, and a zero value falls through to behave like a non-UCP
4095
        POSIX class. There are now also some extra options that force ASCII for
4096
        some classes. */
4097
4098
20.7k
#ifdef SUPPORT_UNICODE
4099
20.7k
        if ((options & PCRE2_UCP) != 0 &&
4100
9.24k
            (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
4101
8.86k
            !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
4102
774
              (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
4103
8.55k
          {
4104
8.55k
          int ptype = posix_substitutes[2*posix_class];
4105
8.55k
          int pvalue = posix_substitutes[2*posix_class + 1];
4106
4107
8.55k
          if (ptype >= 0)
4108
7.24k
            {
4109
7.24k
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
4110
7.24k
            *parsed_pattern++ = (ptype << 16) | pvalue;
4111
7.24k
            goto CLASS_CONTINUE;
4112
7.24k
            }
4113
4114
1.30k
          if (pvalue != 0)
4115
778
            {
4116
778
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
4117
778
            goto CLASS_CONTINUE;
4118
778
            }
4119
4120
          /* Fall through */
4121
1.30k
          }
4122
12.7k
#endif  /* SUPPORT_UNICODE */
4123
4124
        /* Non-UCP POSIX class */
4125
4126
12.7k
        *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
4127
12.7k
        *parsed_pattern++ = posix_class;
4128
12.7k
        }
4129
4130
      /* Check for the start of the outermost class, or the start of a nested class. */
4131
4132
3.15M
      else if ((c == CHAR_LEFT_SQUARE_BRACKET &&
4133
315k
                (class_depth_m1 < 0 || class_mode_state == CLASS_MODE_ALT_EXT ||
4134
117k
                 class_mode_state == CLASS_MODE_PERL_EXT)) ||
4135
2.95M
               (c == CHAR_LEFT_PARENTHESIS &&
4136
58.8k
                class_mode_state == CLASS_MODE_PERL_EXT))
4137
200k
        {
4138
200k
        uint32_t start_c = c;
4139
200k
        uint32_t new_class_mode_state;
4140
4141
        /* Update the class mode, if moving into a 'leaf' inside a Perl extended
4142
        class. */
4143
4144
200k
        if (start_c == CHAR_LEFT_SQUARE_BRACKET &&
4145
199k
            class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 >= 0)
4146
2.19k
          new_class_mode_state = CLASS_MODE_PERL_EXT_LEAF;
4147
197k
        else
4148
197k
          new_class_mode_state = class_mode_state;
4149
4150
        /* Tidy up the other class before starting the nested class. */
4151
        /* -[ beginning a nested class is a literal '-' */
4152
4153
200k
        if (class_range_state == RANGE_STARTED)
4154
79
          parsed_pattern[-1] = CHAR_MINUS;
4155
4156
        /* Disallow implicit union in Perl extended classes. */
4157
4158
200k
        if (class_op_state == CLASS_OP_OPERAND &&
4159
19.7k
            class_mode_state == CLASS_MODE_PERL_EXT)
4160
3
          {
4161
3
          errorcode = ERR113;
4162
3
          goto FAILED;
4163
3
          }
4164
4165
        /* Validate nesting depth */
4166
200k
        if (class_depth_m1 >= ECLASS_NEST_LIMIT - 1)
4167
17
          {
4168
17
          ptr--;  /* Point rightwards at the paren, same as ERR19. */
4169
17
          errorcode = ERR107;  /* Classes too deeply nested */
4170
17
          goto FAILED;
4171
17
          }
4172
4173
        /* Process the character class start. If the first character is '^', set
4174
        the negation flag. If the first few characters (either before or after ^)
4175
        are \Q\E or \E or space or tab in extended-more mode, we skip them too.
4176
        This makes for compatibility with Perl. */
4177
4178
200k
        negate_class = FALSE;
4179
200k
        for (;;)
4180
267k
          {
4181
267k
          if (ptr >= ptrend)
4182
64
            {
4183
64
            if (start_c == CHAR_LEFT_PARENTHESIS)
4184
5
              errorcode = ERR14;  /* Missing terminating ')' */
4185
59
            else
4186
59
              errorcode = ERR6;   /* Missing terminating ']' */
4187
64
            goto FAILED;
4188
64
            }
4189
4190
267k
          GETCHARINCTEST(c, ptr);
4191
267k
          if (new_class_mode_state == CLASS_MODE_PERL_EXT) break;
4192
266k
          else if (c == CHAR_BACKSLASH)
4193
21.6k
            {
4194
21.6k
            if (ptr < ptrend && *ptr == CHAR_E) ptr++;
4195
21.4k
            else if (ptrend - ptr >= 3 &&
4196
21.2k
                PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4197
333
              ptr += 3;
4198
21.0k
            else
4199
21.0k
              break;
4200
21.6k
            }
4201
244k
          else if ((c == CHAR_SPACE || c == CHAR_HT) &&  /* Note: just these two */
4202
1.47k
                   ((options & PCRE2_EXTENDED_MORE) != 0 ||
4203
796
                    new_class_mode_state >= CLASS_MODE_PERL_EXT))
4204
1.06k
            continue;
4205
243k
          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4206
65.4k
            negate_class = TRUE;
4207
177k
          else break;
4208
267k
          }
4209
4210
        /* Now the real contents of the class; c has the first "real" character.
4211
        Empty classes are permitted only if the option is set, and if it's not
4212
        a Perl-extended class. */
4213
4214
199k
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4215
19.5k
            (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0 &&
4216
9.68k
            new_class_mode_state < CLASS_MODE_PERL_EXT)
4217
9.49k
          {
4218
9.49k
          PCRE2_ASSERT(start_c == CHAR_LEFT_SQUARE_BRACKET);
4219
4220
9.49k
          if (class_start != NULL)
4221
3.89k
            {
4222
3.89k
            PCRE2_ASSERT(class_depth_m1 >= 0);
4223
            /* Represents that the class is an extended class. */
4224
3.89k
            *class_start |= CLASS_IS_ECLASS;
4225
3.89k
            class_start = NULL;
4226
3.89k
            }
4227
4228
9.49k
          *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
4229
4230
          /* Leave nesting depth unchanged; but check for zero depth to handle the
4231
          very first (top-level) class being empty. */
4232
9.49k
          if (class_depth_m1 < 0) break;
4233
4234
5.74k
          class_range_state = RANGE_NO; /* for processing the containing class */
4235
5.74k
          class_op_state = CLASS_OP_OPERAND;
4236
5.74k
          goto CLASS_CONTINUE;
4237
9.49k
          }
4238
4239
        /* Enter a non-empty class. */
4240
4241
190k
        if (class_start != NULL)
4242
12.4k
          {
4243
12.4k
          PCRE2_ASSERT(class_depth_m1 >= 0);
4244
          /* Represents that the class is an extended class. */
4245
12.4k
          *class_start |= CLASS_IS_ECLASS;
4246
12.4k
          class_start = NULL;
4247
12.4k
          }
4248
4249
190k
        class_start = parsed_pattern;
4250
190k
        *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
4251
190k
        class_range_state = RANGE_NO;
4252
190k
        class_op_state = CLASS_OP_EMPTY;
4253
190k
        class_mode_state = new_class_mode_state;
4254
190k
        ++class_depth_m1;
4255
190k
        if (class_maxdepth_m1 < class_depth_m1)
4256
175k
          class_maxdepth_m1 = class_depth_m1;
4257
        /* Reset; no op seen yet at new depth. */
4258
190k
        cb->class_op_used[class_depth_m1] = 0;
4259
4260
        /* Implement the special start-of-class literal meaning of ']'. */
4261
190k
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4262
10.1k
            new_class_mode_state != CLASS_MODE_PERL_EXT)
4263
10.0k
          {
4264
10.0k
          class_range_state = RANGE_OK_LITERAL;
4265
10.0k
          class_op_state = CLASS_OP_OPERAND;
4266
10.0k
          PARSED_LITERAL(c, parsed_pattern);
4267
10.0k
          goto CLASS_CONTINUE;
4268
10.0k
          }
4269
4270
180k
        continue;  /* We have already loaded c with the next character */
4271
190k
        }
4272
4273
      /* Check for the end of the class. */
4274
4275
2.95M
      else if (c == CHAR_RIGHT_SQUARE_BRACKET ||
4276
2.76M
               (c == CHAR_RIGHT_PARENTHESIS && class_mode_state == CLASS_MODE_PERL_EXT))
4277
187k
        {
4278
        /* In Perl extended mode, the ']' can only be used to match the
4279
        opening '[', and ')' must match an opening parenthesis. */
4280
187k
        if (class_mode_state == CLASS_MODE_PERL_EXT)
4281
670
          {
4282
670
          if (c == CHAR_RIGHT_SQUARE_BRACKET && class_depth_m1 != 0)
4283
3
            {
4284
3
            errorcode = ERR14;
4285
3
            ptr--;  /* Correct the offset */
4286
3
            goto FAILED;
4287
3
            }
4288
667
          if (c == CHAR_RIGHT_PARENTHESIS && class_depth_m1 < 1)
4289
3
            {
4290
3
            errorcode = ERR22;
4291
3
            goto FAILED;
4292
3
            }
4293
667
          }
4294
4295
        /* Check no trailing operator. */
4296
187k
        if (class_op_state == CLASS_OP_OPERATOR)
4297
3
          {
4298
3
          errorcode = ERR110;
4299
3
          goto FAILED;
4300
3
          }
4301
4302
        /* Check no empty expression for Perl extended expressions. */
4303
187k
        if (class_mode_state == CLASS_MODE_PERL_EXT &&
4304
662
            class_op_state == CLASS_OP_EMPTY)
4305
6
          {
4306
6
          errorcode = ERR114;
4307
6
          goto FAILED;
4308
6
          }
4309
4310
        /* -] at the end of a class is a literal '-' */
4311
187k
        if (class_range_state == RANGE_STARTED)
4312
10.1k
          parsed_pattern[-1] = CHAR_MINUS;
4313
4314
187k
        *parsed_pattern++ = META_CLASS_END;
4315
4316
187k
        if (--class_depth_m1 < 0)
4317
164k
          {
4318
          /* Check for and consume ')' after '(?[...]'. */
4319
164k
          PCRE2_ASSERT(class_mode_state != CLASS_MODE_PERL_EXT_LEAF);
4320
164k
          if (class_mode_state == CLASS_MODE_PERL_EXT)
4321
617
            {
4322
617
            if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4323
6
              {
4324
6
              errorcode = ERR115;
4325
6
              goto FAILED;
4326
6
              }
4327
4328
611
            ptr++;
4329
611
            }
4330
4331
164k
          break;
4332
164k
          }
4333
4334
23.5k
        class_range_state = RANGE_NO; /* for processing the containing class */
4335
23.5k
        class_op_state = CLASS_OP_OPERAND;
4336
23.5k
        if (class_mode_state == CLASS_MODE_PERL_EXT_LEAF)
4337
2.13k
          class_mode_state = CLASS_MODE_PERL_EXT;
4338
        /* The extended class flag has already
4339
        been set for the parent class. */
4340
23.5k
        class_start = NULL;
4341
23.5k
        }
4342
4343
      /* Handle a Perl set binary operator */
4344
4345
2.76M
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4346
5.18k
               (c == CHAR_PLUS || c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4347
3.31k
                c == CHAR_AMPERSAND || c == CHAR_CIRCUMFLEX_ACCENT))
4348
2.72k
        {
4349
        /* Check that there was a preceding operand. */
4350
2.72k
        if (class_op_state != CLASS_OP_OPERAND)
4351
20
          {
4352
20
          errorcode = ERR109;
4353
20
          goto FAILED;
4354
20
          }
4355
4356
2.70k
        if (class_start != NULL)
4357
295
          {
4358
295
          PCRE2_ASSERT(class_depth_m1 >= 0);
4359
          /* Represents that the class is an extended class. */
4360
295
          *class_start |= CLASS_IS_ECLASS;
4361
295
          class_start = NULL;
4362
295
          }
4363
4364
2.70k
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4365
2.70k
                     class_range_state != RANGE_FORBID_STARTED);
4366
4367
2.70k
        *parsed_pattern++ = c == CHAR_PLUS? META_ECLASS_OR :
4368
2.70k
                            c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4369
2.39k
                            c == CHAR_MINUS? META_ECLASS_SUB :
4370
2.29k
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4371
838
                            META_ECLASS_XOR;
4372
2.70k
        class_range_state = RANGE_NO;
4373
2.70k
        class_op_state = CLASS_OP_OPERATOR;
4374
2.70k
        }
4375
4376
      /* Handle a Perl set unary operator */
4377
4378
2.76M
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4379
2.46k
               c == CHAR_EXCLAMATION_MARK)
4380
1.15k
        {
4381
        /* Check that the "!" has not got a preceding operand (i.e. it's the
4382
        start of the class, or follows an operator). */
4383
1.15k
        if (class_op_state == CLASS_OP_OPERAND)
4384
3
          {
4385
3
          errorcode = ERR113;
4386
3
          goto FAILED;
4387
3
          }
4388
4389
1.14k
        if (class_start != NULL)
4390
377
          {
4391
377
          PCRE2_ASSERT(class_depth_m1 >= 0);
4392
          /* Represents that the class is an extended class. */
4393
377
          *class_start |= CLASS_IS_ECLASS;
4394
377
          class_start = NULL;
4395
377
          }
4396
4397
1.14k
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4398
1.14k
                     class_range_state != RANGE_FORBID_STARTED);
4399
4400
1.14k
        *parsed_pattern++ = META_ECLASS_NOT;
4401
1.14k
        class_range_state = RANGE_NO;
4402
1.14k
        class_op_state = CLASS_OP_OPERATOR;
4403
1.14k
        }
4404
4405
      /* Handle a UTS#18 set operator */
4406
4407
2.76M
      else if (class_mode_state == CLASS_MODE_ALT_EXT &&
4408
511k
               (c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4409
500k
                c == CHAR_AMPERSAND || c == CHAR_TILDE) &&
4410
20.2k
               ptr < ptrend && *ptr == c)
4411
7.43k
        {
4412
7.43k
        ++ptr;
4413
4414
        /* Check there isn't a triple-repetition. */
4415
7.43k
        if (ptr < ptrend && *ptr == c)
4416
13
          {
4417
265
          while (ptr < ptrend && *ptr == c) ++ptr;  /* Improve error offset. */
4418
13
          errorcode = ERR108;
4419
13
          goto FAILED;
4420
13
          }
4421
4422
        /* Check for a preceding operand. */
4423
7.42k
        if (class_op_state != CLASS_OP_OPERAND)
4424
4
          {
4425
4
          errorcode = ERR109;
4426
4
          goto FAILED;
4427
4
          }
4428
4429
        /* Check for mixed precedence. Forbid [A--B&&C]. */
4430
7.42k
        if (cb->class_op_used[class_depth_m1] != 0 &&
4431
5.09k
            cb->class_op_used[class_depth_m1] != (uint8_t)c)
4432
3
          {
4433
3
          errorcode = ERR111;
4434
3
          goto FAILED;
4435
3
          }
4436
4437
7.41k
        if (class_start != NULL)
4438
1.61k
          {
4439
1.61k
          PCRE2_ASSERT(class_depth_m1 >= 0);
4440
          /* Represents that the class is an extended class. */
4441
1.61k
          *class_start |= CLASS_IS_ECLASS;
4442
1.61k
          class_start = NULL;
4443
1.61k
          }
4444
4445
        /* Dangling '-' before an operator is a literal */
4446
7.41k
        if (class_range_state == RANGE_STARTED)
4447
227
          parsed_pattern[-1] = CHAR_MINUS;
4448
4449
7.41k
        *parsed_pattern++ = c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4450
7.41k
                            c == CHAR_MINUS? META_ECLASS_SUB :
4451
6.46k
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4452
5.66k
                            META_ECLASS_XOR;
4453
7.41k
        class_range_state = RANGE_NO;
4454
7.41k
        class_op_state = CLASS_OP_OPERATOR;
4455
7.41k
        cb->class_op_used[class_depth_m1] = (uint8_t)c;
4456
7.41k
        }
4457
4458
      /* Handle escapes in a class */
4459
4460
2.75M
      else if (c == CHAR_BACKSLASH)
4461
146k
        {
4462
146k
        tempptr = ptr;
4463
146k
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
4464
146k
          xoptions, cb->bracount, TRUE, cb);
4465
4466
146k
        if (errorcode != 0)
4467
61
          {
4468
61
          if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0 ||
4469
0
              class_mode_state >= CLASS_MODE_PERL_EXT)
4470
61
            goto FAILED;
4471
0
          ptr = tempptr;
4472
0
          if (ptr >= ptrend) c = CHAR_BACKSLASH; else
4473
0
            {
4474
0
            GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
4475
0
            }
4476
0
          escape = 0;                 /* Treat as literal character */
4477
0
          }
4478
4479
146k
        switch(escape)
4480
146k
          {
4481
45.5k
          case 0:  /* Escaped character code point is in c */
4482
45.5k
          char_is_literal = FALSE;
4483
45.5k
          goto CLASS_LITERAL;      /* (label is further below) */
4484
4485
1.99k
          case ESC_b:
4486
1.99k
          c = CHAR_BS;    /* \b is backspace in a class */
4487
1.99k
          char_is_literal = FALSE;
4488
1.99k
          goto CLASS_LITERAL;
4489
4490
253
          case ESC_k:
4491
253
          c = CHAR_k;     /* \k is not special in a class, just like \g */
4492
253
          char_is_literal = FALSE;
4493
253
          goto CLASS_LITERAL;
4494
4495
599
          case ESC_Q:
4496
599
          inescq = TRUE;  /* Enter literal mode */
4497
599
          goto CLASS_CONTINUE;
4498
4499
937
          case ESC_E:     /* Ignore orphan \E */
4500
937
          goto CLASS_CONTINUE;
4501
4502
10
          case ESC_B:     /* Always an error in a class */
4503
15
          case ESC_R:
4504
27
          case ESC_X:
4505
27
          errorcode = ERR7;
4506
27
          goto FAILED;
4507
4508
14
          case ESC_N:     /* Not permitted by Perl either */
4509
14
          errorcode = ERR71;
4510
14
          goto FAILED;
4511
4512
8.01k
          case ESC_H:
4513
15.3k
          case ESC_h:
4514
17.9k
          case ESC_V:
4515
19.7k
          case ESC_v:
4516
19.7k
          *parsed_pattern++ = META_ESCAPE + escape;
4517
19.7k
          break;
4518
4519
          /* These escapes may be converted to Unicode property tests when
4520
          PCRE2_UCP is set. */
4521
4522
5.96k
          case ESC_d:
4523
11.6k
          case ESC_D:
4524
39.1k
          case ESC_s:
4525
46.1k
          case ESC_S:
4526
57.0k
          case ESC_w:
4527
67.8k
          case ESC_W:
4528
67.8k
          parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
4529
67.8k
            xoptions);
4530
67.8k
          break;
4531
4532
          /* Explicit Unicode property matching */
4533
4534
3.47k
          case ESC_P:
4535
9.41k
          case ESC_p:
4536
9.41k
#ifdef SUPPORT_UNICODE
4537
9.41k
            {
4538
9.41k
            BOOL negated;
4539
9.41k
            uint16_t ptype = 0, pdata = 0;
4540
9.41k
            if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))
4541
39
              goto FAILED;
4542
4543
            /* In caseless matching, particular characteristics Lu, Ll, and Lt
4544
            get converted to the general characteristic L&. That is, upper,
4545
            lower, and title case letters are all conflated. */
4546
4547
9.37k
            if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
4548
1.29k
                (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
4549
1.05k
              {
4550
1.05k
              ptype = PT_LAMP;
4551
1.05k
              pdata = 0;
4552
1.05k
              }
4553
4554
9.37k
            if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
4555
9.37k
            *parsed_pattern++ = META_ESCAPE + escape;
4556
9.37k
            *parsed_pattern++ = (ptype << 16) | pdata;
4557
9.37k
            }
4558
#else
4559
          errorcode = ERR45;
4560
          goto FAILED;
4561
#endif
4562
0
          break;  /* End \P and \p */
4563
4564
          /* All others are not allowed in a class */
4565
4566
          /* LCOV_EXCL_START */
4567
0
          default:
4568
0
          PCRE2_DEBUG_UNREACHABLE();
4569
0
          PCRE2_FALLTHROUGH /* Fall through */
4570
          /* LCOV_EXCL_STOP */
4571
4572
14
          case ESC_A:
4573
20
          case ESC_Z:
4574
26
          case ESC_z:
4575
28
          case ESC_G:
4576
28
          case ESC_K:
4577
28
          case ESC_C:
4578
28
          errorcode = ERR7;
4579
28
          goto FAILED;
4580
146k
          }
4581
4582
        /* All the switch-cases above which end in "break" describe a set
4583
        of characters. None may start a range. */
4584
4585
        /* The second part of a range can be a single-character escape
4586
        sequence (detected above), but not any of the other escapes. Perl
4587
        treats a hyphen as a literal in such circumstances. However, in Perl's
4588
        warning mode, a warning is given, so PCRE now faults it, as it is
4589
        almost certainly a mistake on the user's part. */
4590
4591
97.0k
        if (class_range_state == RANGE_STARTED)
4592
8
          {
4593
8
          errorcode = ERR50;
4594
8
          goto FAILED;
4595
8
          }
4596
4597
        /* Perl gives a warning unless the hyphen following a multi-character
4598
        escape is the last character in the class. PCRE throws an error. */
4599
4600
97.0k
        if (class_range_state == RANGE_FORBID_STARTED)
4601
3
          {
4602
3
          ptr = class_range_forbid_ptr;
4603
3
          errorcode = ERR50;
4604
3
          goto FAILED;
4605
3
          }
4606
4607
        /* Disallow implicit union in Perl extended classes. */
4608
4609
97.0k
        if (class_op_state == CLASS_OP_OPERAND &&
4610
79.3k
            class_mode_state == CLASS_MODE_PERL_EXT)
4611
3
          {
4612
3
          errorcode = ERR113;
4613
3
          goto FAILED;
4614
3
          }
4615
4616
97.0k
        class_range_state = RANGE_FORBID_NO;
4617
97.0k
        class_op_state = CLASS_OP_OPERAND;
4618
97.0k
        }
4619
4620
      /* Forbid unescaped literals, and the special meaning of '-', inside a
4621
      Perl extended class. */
4622
4623
2.60M
      else if (class_mode_state == CLASS_MODE_PERL_EXT)
4624
66
        {
4625
66
        errorcode = ERR116;
4626
66
        goto FAILED;
4627
66
        }
4628
4629
      /* Handle potential start of range */
4630
4631
2.60M
      else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
4632
20.8k
        {
4633
20.8k
        *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
4634
17.6k
          META_RANGE_LITERAL : META_RANGE_ESCAPED;
4635
20.8k
        class_range_state = RANGE_STARTED;
4636
20.8k
        }
4637
4638
      /* Handle forbidden start of range */
4639
4640
2.58M
      else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
4641
213
        {
4642
213
        *parsed_pattern++ = CHAR_MINUS;
4643
213
        class_range_state = RANGE_FORBID_STARTED;
4644
213
        class_range_forbid_ptr = ptr;
4645
213
        }
4646
4647
      /* Handle a literal character */
4648
4649
2.58M
      else
4650
2.58M
        {
4651
2.63M
        CLASS_LITERAL:
4652
4653
        /* Disallow implicit union in Perl extended classes. */
4654
4655
2.63M
        if (class_op_state == CLASS_OP_OPERAND &&
4656
2.48M
            class_mode_state == CLASS_MODE_PERL_EXT)
4657
4
          {
4658
4
          errorcode = ERR113;
4659
4
          goto FAILED;
4660
4
          }
4661
4662
2.63M
        if (class_range_state == RANGE_STARTED)
4663
10.3k
          {
4664
10.3k
          if (c == parsed_pattern[-2])       /* Optimize one-char range */
4665
610
            parsed_pattern--;
4666
9.72k
          else if (parsed_pattern[-2] > c)   /* Check range is in order */
4667
66
            {
4668
66
            errorcode = ERR8;
4669
66
            goto FAILED;
4670
66
            }
4671
9.65k
          else
4672
9.65k
            {
4673
9.65k
            if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
4674
667
              parsed_pattern[-1] = META_RANGE_ESCAPED;
4675
9.65k
            PARSED_LITERAL(c, parsed_pattern);
4676
9.65k
            }
4677
10.2k
          class_range_state = RANGE_NO;
4678
10.2k
          class_op_state = CLASS_OP_OPERAND;
4679
10.2k
          }
4680
2.62M
        else if (class_range_state == RANGE_FORBID_STARTED)
4681
9
          {
4682
9
          ptr = class_range_forbid_ptr;
4683
9
          errorcode = ERR50;
4684
9
          goto FAILED;
4685
9
          }
4686
2.62M
        else  /* Potential start of range */
4687
2.62M
          {
4688
2.62M
          class_range_state = char_is_literal?
4689
2.58M
            RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
4690
2.62M
          class_op_state = CLASS_OP_OPERAND;
4691
2.62M
          PARSED_LITERAL(c, parsed_pattern);
4692
2.62M
          }
4693
2.63M
        }
4694
4695
      /* Proceed to next thing in the class. */
4696
4697
2.83M
      CLASS_CONTINUE:
4698
2.83M
      if (ptr >= ptrend)
4699
1.23k
        {
4700
1.23k
        if (class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 > 0)
4701
3
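          errorcode = ERR14;   /* Missing terminating ')' -- NOTE(review): the if/else that follows always overwrites this value; confirm whether "else if" was intended */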
4702
1.23k
        if (class_mode_state == CLASS_MODE_ALT_EXT &&
4703
340
            class_depth_m1 == 0 && class_maxdepth_m1 == 1)
4704
29
          errorcode = ERR112;  /* Missing terminating ']', but we saw '[ [ ]...' */
4705
1.21k
        else
4706
1.21k
          errorcode = ERR6;    /* Missing terminating ']' */
4707
1.23k
        goto FAILED;
4708
1.23k
        }
4709
2.83M
      GETCHARINCTEST(c, ptr);
4710
2.83M
      }     /* End of class-processing loop */
4711
4712
168k
    break;  /* End of character class */
4713
4714
4715
    /* ---- Opening parenthesis ---- */
4716
4717
1.07M
    case CHAR_LEFT_PARENTHESIS:
4718
1.07M
    if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4719
4720
    /* If ( is not followed by ? it is either a capture or a special verb or an
4721
    alpha assertion or a positive non-atomic lookahead. */
4722
4723
1.07M
    if (*ptr != CHAR_QUESTION_MARK)
4724
883k
      {
4725
883k
      const char *vn;
4726
4727
      /* Handle capturing brackets (or non-capturing if auto-capture is turned
4728
      off). */
4729
4730
883k
      if (*ptr != CHAR_ASTERISK)
4731
848k
        {
4732
848k
        nest_depth++;
4733
848k
        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
4734
798k
          {
4735
798k
          if (cb->bracount >= MAX_GROUP_NUMBER)
4736
0
            {
4737
0
            errorcode = ERR97;
4738
0
            goto FAILED;
4739
0
            }
4740
798k
          cb->bracount++;
4741
798k
          *parsed_pattern++ = META_CAPTURE | cb->bracount;
4742
798k
          }
4743
50.4k
        else *parsed_pattern++ = META_NOCAPTURE;
4744
848k
        }
4745
4746
      /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
4747
      quantifier" error rather than "(*MARK) must have an argument". */
4748
4749
35.2k
      else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
4750
8
        break;
4751
4752
      /* Handle "alpha assertions" such as (*pla:...). Most of these are
4753
      synonyms for the historical symbolic assertions, but the script run and
4754
      non-atomic lookaround ones are new. They are distinguished by starting
4755
      with a lower case letter. Testing the ctype_lcletter bit in the ctypes
4756
      table makes this work in all character codes. */
4757
4758
35.2k
      else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
4759
4.19k
        {
4760
4.19k
        uint32_t meta;
4761
4762
4.19k
        vn = alasnames;
4763
4.19k
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4764
4.19k
          &errorcode, cb)) goto FAILED;
4765
4.19k
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4766
4.18k
        if (*ptr != CHAR_COLON)
4767
14
          {
4768
14
          errorcode = ERR95;  /* Malformed */
4769
14
          goto FAILED_FORWARD;
4770
14
          }
4771
4772
        /* Scan the table of alpha assertion names */
4773
4774
52.9k
        for (i = 0; i < alascount; i++)
4775
52.8k
          {
4776
52.8k
          if (namelen == alasmeta[i].len &&
4777
14.8k
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4778
4.15k
            break;
4779
48.7k
          vn += alasmeta[i].len + 1;
4780
48.7k
          }
4781
4782
4.17k
        if (i >= alascount)
4783
16
          {
4784
16
          errorcode = ERR95;  /* Alpha assertion not recognized */
4785
16
          goto FAILED;
4786
16
          }
4787
4788
        /* Check for expecting an assertion condition. If so, only atomic
4789
        lookaround assertions are valid. */
4790
4791
4.15k
        meta = alasmeta[i].meta;
4792
4.15k
        if (prev_expect_cond_assert > 0 &&
4793
74
            (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
4794
6
          {
4795
6
          errorcode = ERR28;  /* Atomic assertion expected */
4796
6
          goto FAILED;
4797
6
          }
4798
4799
        /* The lookaround alphabetic synonyms can mostly be handled by jumping
4800
        to the code that handles the traditional symbolic forms. */
4801
4802
4.15k
        switch(meta)
4803
4.15k
          {
4804
          /* LCOV_EXCL_START */
4805
0
          default:
4806
0
          PCRE2_DEBUG_UNREACHABLE();
4807
0
          errorcode = ERR89;  /* Unknown code; should never occur because */
4808
0
          goto FAILED;        /* the meta values come from a table above. */
4809
          /* LCOV_EXCL_STOP */
4810
4811
0
          case META_ATOMIC:
4812
0
          goto ATOMIC_GROUP;
4813
4814
69
          case META_LOOKAHEAD:
4815
69
          goto POSITIVE_LOOK_AHEAD;
4816
4817
66
          case META_LOOKAHEAD_NA:
4818
66
          goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4819
4820
198
          case META_LOOKAHEADNOT:
4821
198
          goto NEGATIVE_LOOK_AHEAD;
4822
4823
1.92k
          case META_SCS:
4824
1.92k
          ptr++;
4825
1.92k
          *parsed_pattern++ = META_SCS;
4826
4827
1.92k
          parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
4828
1.92k
                                              0, &errorcode, cb);
4829
1.92k
          if (parsed_pattern == NULL) goto FAILED;
4830
1.90k
          goto POST_ASSERTION;
4831
4832
1.90k
          case META_LOOKBEHIND:
4833
286
          case META_LOOKBEHINDNOT:
4834
354
          case META_LOOKBEHIND_NA:
4835
354
          *parsed_pattern++ = meta;
4836
354
          ptr--;
4837
354
          goto POST_LOOKBEHIND;
4838
4839
          /* The script run facilities are handled here. Unicode support is
4840
          required (give an error if not, as this is a security issue). Always
4841
          record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4842
          META_ATOMIC and remember that we need two META_KETs at the end. */
4843
4844
1.12k
          case META_SCRIPT_RUN:
4845
1.54k
          case META_ATOMIC_SCRIPT_RUN:
4846
1.54k
#ifdef SUPPORT_UNICODE
4847
1.54k
          *parsed_pattern++ = META_SCRIPT_RUN;
4848
1.54k
          nest_depth++;
4849
1.54k
          ptr++;
4850
1.54k
          if (meta == META_ATOMIC_SCRIPT_RUN)
4851
419
            {
4852
419
            *parsed_pattern++ = META_ATOMIC;
4853
419
            if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4854
130
            else if (++top_nest >= end_nests)
4855
0
              {
4856
0
              errorcode = ERR84;
4857
0
              goto FAILED;
4858
0
              }
4859
419
            top_nest->nest_depth = nest_depth;
4860
419
            top_nest->flags = NSF_ATOMICSR;
4861
419
            top_nest->options = options & PARSE_TRACKED_OPTIONS;
4862
419
            top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4863
4864
#ifdef PCRE2_DEBUG
4865
            /* We'll write out two META_KETs for a single ")" in the input
4866
            pattern, so we reserve space for that in our bounds check. */
4867
            parsed_pattern_extra++;
4868
#endif
4869
419
            }
4870
1.54k
          break;
4871
#else  /* SUPPORT_UNICODE */
4872
          errorcode = ERR96;
4873
          goto FAILED;
4874
#endif
4875
4.15k
          }
4876
4.15k
        }
4877
4878
4879
      /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4880
4881
31.0k
      else
4882
31.0k
        {
4883
31.0k
        vn = verbnames;
4884
31.0k
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4885
31.0k
          &errorcode, cb)) goto FAILED;
4886
31.0k
        if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4887
23.9k
                              *ptr != CHAR_RIGHT_PARENTHESIS))
4888
50
          {
4889
50
          errorcode = ERR60;  /* Malformed */
4890
50
          goto FAILED;
4891
50
          }
4892
4893
        /* Scan the table of verb names */
4894
4895
200k
        for (i = 0; i < verbcount; i++)
4896
200k
          {
4897
200k
          if (namelen == verbs[i].len &&
4898
72.2k
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4899
30.9k
            break;
4900
169k
          vn += verbs[i].len + 1;
4901
169k
          }
4902
4903
31.0k
        if (i >= verbcount)
4904
23
          {
4905
23
          errorcode = ERR60;  /* Verb not recognized */
4906
23
          goto FAILED;
4907
23
          }
4908
4909
        /* An empty argument is treated as no argument. */
4910
4911
30.9k
        if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4912
7.07k
             ptr[1] == CHAR_RIGHT_PARENTHESIS)
4913
215
          ptr++;    /* Advance to the closing parens */
4914
4915
        /* Check for mandatory non-empty argument; this is (*MARK) */
4916
4917
30.9k
        if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4918
3
          {
4919
3
          errorcode = ERR66;
4920
3
          goto FAILED;
4921
3
          }
4922
4923
        /* Remember where this verb, possibly with a preceding (*MARK), starts,
4924
        for handling quantified (*ACCEPT). */
4925
4926
30.9k
        verbstartptr = parsed_pattern;
4927
30.9k
        okquantifier = (verbs[i].meta == META_ACCEPT);
4928
#ifdef PCRE2_DEBUG
4929
        /* Reserve space in our bounds check for optionally wrapping the (*ACCEPT)
4930
        with a non-capturing bracket, if there is a following quantifier. */
4931
        if (okquantifier) parsed_pattern_extra += 2;
4932
#endif
4933
4934
        /* It appears that Perl allows any characters whatsoever, other than a
4935
        closing parenthesis, to appear in arguments ("names"), so we no longer
4936
        insist on letters, digits, and underscores. Perl does not, however, do
4937
        any interpretation within arguments, and has no means of including a
4938
        closing parenthesis. PCRE supports escape processing but only when it
4939
        is requested by an option. We set inverbname TRUE here, and let the
4940
        main loop take care of this so that escape and \x processing is done by
4941
        the main code above. */
4942
4943
30.9k
        if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4944
6.86k
          {
4945
          /* Some optional arguments can be treated as a preceding (*MARK) */
4946
4947
6.86k
          if (verbs[i].has_arg < 0)
4948
524
            {
4949
524
            add_after_mark = verbs[i].meta;
4950
524
            *parsed_pattern++ = META_MARK;
4951
524
            }
4952
4953
          /* The remaining verbs with arguments (except *MARK) need a different
4954
          opcode. */
4955
4956
6.33k
          else
4957
6.33k
            {
4958
6.33k
            *parsed_pattern++ = verbs[i].meta +
4959
6.33k
              ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4960
6.33k
            }
4961
4962
          /* Set up for reading the name in the main loop. */
4963
4964
6.86k
          verblengthptr = parsed_pattern++;
4965
6.86k
          verbnamestart = ptr;
4966
6.86k
          inverbname = TRUE;
4967
6.86k
          }
4968
24.1k
        else  /* No verb "name" argument */
4969
24.1k
          {
4970
24.1k
          *parsed_pattern++ = verbs[i].meta;
4971
24.1k
          }
4972
30.9k
        }     /* End of (*VERB) handling */
4973
881k
      break;  /* Done with this parenthesis */
4974
883k
      }       /* End of groups that don't start with (? */
4975
4976
4977
    /* ---- Items starting (? ---- */
4978
4979
    /* The type of item is determined by what follows (?. Handle (?| and option
4980
    changes under "default" because both need a new block on the nest stack.
4981
    Comments starting with (?# are handled above. Note that there is some
4982
    ambiguity about the sequence (?- because if a digit follows it's a relative
4983
    recursion or subroutine call whereas otherwise it's an option unsetting. */
4984
4985
194k
    if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4986
4987
194k
    switch(*ptr)
4988
194k
      {
4989
20.8k
      default:
4990
20.8k
      if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4991
295
        goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4992
4993
      /* We now have either (?| or a (possibly empty) option setting,
4994
      optionally followed by a non-capturing group. */
4995
4996
20.5k
      nest_depth++;
4997
20.5k
      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4998
11.4k
      else if (++top_nest >= end_nests)
4999
0
        {
5000
0
        errorcode = ERR84;
5001
0
        goto FAILED;
5002
0
        }
5003
20.5k
      top_nest->nest_depth = nest_depth;
5004
20.5k
      top_nest->flags = 0;
5005
20.5k
      top_nest->options = options & PARSE_TRACKED_OPTIONS;
5006
20.5k
      top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5007
5008
      /* Start of non-capturing group that resets the capture count for each
5009
      branch. */
5010
5011
20.5k
      if (*ptr == CHAR_VERTICAL_LINE)
5012
4.34k
        {
5013
4.34k
        top_nest->reset_group = (uint16_t)cb->bracount;
5014
4.34k
        top_nest->max_group = (uint16_t)cb->bracount;
5015
4.34k
        top_nest->flags |= NSF_RESET;
5016
4.34k
        cb->external_flags |= PCRE2_DUPCAPUSED;
5017
4.34k
        *parsed_pattern++ = META_NOCAPTURE;
5018
4.34k
        ptr++;
5019
4.34k
        }
5020
5021
      /* Scan for options aimnrsxJU (plus aD, aP, aS, aT, aW, and xx) to be set or unset. */
5022
5023
16.2k
      else
5024
16.2k
        {
5025
16.2k
        BOOL hyphenok = TRUE;
5026
16.2k
        uint32_t oldoptions = options;
5027
16.2k
        uint32_t oldxoptions = xoptions;
5028
5029
16.2k
        top_nest->reset_group = 0;
5030
16.2k
        top_nest->max_group = 0;
5031
16.2k
        set = unset = 0;
5032
16.2k
        optset = &set;
5033
16.2k
        xset = xunset = 0;
5034
16.2k
        xoptset = &xset;
5035
5036
        /* ^ at the start unsets irmnsx and disables the subsequent use of - */
5037
5038
16.2k
        if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
5039
197
          {
5040
197
          options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
5041
197
                       PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
5042
197
          xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
5043
197
          hyphenok = FALSE;
5044
197
          ptr++;
5045
197
          }
5046
5047
27.3k
        while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
5048
14.3k
                               *ptr != CHAR_COLON)
5049
11.2k
          {
5050
11.2k
          switch (*ptr++)
5051
11.2k
            {
5052
565
            case CHAR_MINUS:
5053
565
            if (!hyphenok)
5054
3
              {
5055
3
              errorcode = ERR94;
5056
3
              goto FAILED;
5057
3
              }
5058
562
            optset = &unset;
5059
562
            xoptset = &xunset;
5060
562
            hyphenok = FALSE;
5061
562
            break;
5062
5063
            /* There are some two-character sequences that start with 'a'. */
5064
5065
1.93k
            case CHAR_a:
5066
1.93k
            if (ptr < ptrend)
5067
1.92k
              {
5068
1.92k
              if (*ptr == CHAR_D)
5069
66
                {
5070
66
                *xoptset |= PCRE2_EXTRA_ASCII_BSD;
5071
66
                ptr++;
5072
66
                break;
5073
66
                }
5074
1.85k
              if (*ptr == CHAR_P)
5075
203
                {
5076
203
                *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
5077
203
                ptr++;
5078
203
                break;
5079
203
                }
5080
1.65k
              if (*ptr == CHAR_S)
5081
66
                {
5082
66
                *xoptset |= PCRE2_EXTRA_ASCII_BSS;
5083
66
                ptr++;
5084
66
                break;
5085
66
                }
5086
1.58k
              if (*ptr == CHAR_T)
5087
466
                {
5088
466
                *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
5089
466
                ptr++;
5090
466
                break;
5091
466
                }
5092
1.12k
              if (*ptr == CHAR_W)
5093
74
                {
5094
74
                *xoptset |= PCRE2_EXTRA_ASCII_BSW;
5095
74
                ptr++;
5096
74
                break;
5097
74
                }
5098
1.12k
              }
5099
1.05k
            *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
5100
1.05k
                        PCRE2_EXTRA_ASCII_BSW|
5101
1.05k
                        PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
5102
1.05k
            break;
5103
5104
3.96k
            case CHAR_J:  /* Record that it changed in the external options */
5105
3.96k
            *optset |= PCRE2_DUPNAMES;
5106
3.96k
            cb->external_flags |= PCRE2_JCHANGED;
5107
3.96k
            break;
5108
5109
1.24k
            case CHAR_i: *optset |= PCRE2_CASELESS; break;
5110
443
            case CHAR_m: *optset |= PCRE2_MULTILINE; break;
5111
219
            case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
5112
854
            case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
5113
240
            case CHAR_s: *optset |= PCRE2_DOTALL; break;
5114
621
            case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
5115
5116
            /* If x appears twice it sets the extended extended option. */
5117
5118
1.00k
            case CHAR_x:
5119
1.00k
            *optset |= PCRE2_EXTENDED;
5120
1.00k
            if (ptr < ptrend && *ptr == CHAR_x)
5121
199
              {
5122
199
              *optset |= PCRE2_EXTENDED_MORE;
5123
199
              ptr++;
5124
199
              }
5125
1.00k
            break;
5126
5127
148
            default:
5128
148
            errorcode = ERR11;
5129
148
            goto FAILED;
5130
11.2k
            }
5131
11.2k
          }
5132
5133
        /* If we are setting extended without extended-more, ensure that any
5134
        existing extended-more gets unset. Also, unsetting extended must also
5135
        unset extended-more. */
5136
5137
16.1k
        if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
5138
15.6k
            (unset & PCRE2_EXTENDED) != 0)
5139
810
          unset |= PCRE2_EXTENDED_MORE;
5140
5141
16.1k
        options = (options | set) & (~unset);
5142
16.1k
        xoptions = (xoptions | xset) & (~xunset);
5143
5144
        /* If the options ended with ')' this is not the start of a nested
5145
        group with option changes, so the options change at this level.
5146
        In this case, if the previous level set up a nest block, discard the
5147
        one we have just created. Otherwise adjust it for the previous level.
5148
        If the options ended with ':' we are starting a non-capturing group,
5149
        possibly with an options setting. */
5150
5151
16.1k
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5152
16.0k
        if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
5153
12.9k
          {
5154
12.9k
          nest_depth--;  /* This is not a nested group after all. */
5155
12.9k
          if (top_nest > (nest_save *)(cb->start_workspace) &&
5156
8.03k
              (top_nest-1)->nest_depth == nest_depth) top_nest--;
5157
8.59k
          else top_nest->nest_depth = nest_depth;
5158
12.9k
          }
5159
3.06k
        else *parsed_pattern++ = META_NOCAPTURE;
5160
5161
        /* If nothing changed, no need to record. */
5162
5163
16.0k
        if (options != oldoptions || xoptions != oldxoptions)
5164
3.04k
          {
5165
3.04k
          *parsed_pattern++ = META_OPTIONS;
5166
3.04k
          *parsed_pattern++ = options;
5167
3.04k
          *parsed_pattern++ = xoptions;
5168
3.04k
          }
5169
16.0k
        }     /* End options processing */
5170
20.3k
      break;  /* End default case after (? */
5171
5172
5173
      /* ---- Python syntax support ---- */
5174
5175
20.3k
      case CHAR_P:
5176
452
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5177
5178
      /* (?P<name> is the same as (?<name>, which defines a named group. */
5179
5180
439
      if (*ptr == CHAR_LESS_THAN_SIGN)
5181
104
        {
5182
104
        terminator = CHAR_GREATER_THAN_SIGN;
5183
104
        goto DEFINE_NAME;
5184
104
        }
5185
5186
      /* (?P>name) is the same as (?&name), which is a recursion or subroutine
5187
      call. */
5188
5189
335
      if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
5190
5191
      /* (?P=name) is the same as \k<name>, a back reference by name. Anything
5192
      else after (?P is an error. */
5193
5194
221
      if (*ptr != CHAR_EQUALS_SIGN)
5195
8
        {
5196
8
        errorcode = ERR41;
5197
8
        goto FAILED_FORWARD;
5198
8
        }
5199
213
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5200
213
          &namelen, &errorcode, cb)) goto FAILED;
5201
210
      *parsed_pattern++ = META_BACKREF_BYNAME;
5202
210
      *parsed_pattern++ = namelen;
5203
210
      PUTOFFSET(offset, parsed_pattern);
5204
210
      okquantifier = TRUE;
5205
210
      break;   /* End of (?P processing */
5206
5207
5208
      /* ---- Recursion/subroutine calls by number ---- */
5209
5210
10.6k
      case CHAR_R:
5211
10.6k
      i = 0;         /* (?R) == (?R0) */
5212
10.6k
      ptr++;
5213
10.6k
      if (ptr >= ptrend || (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_LEFT_PARENTHESIS))
5214
12
        {
5215
12
        errorcode = ERR58;
5216
12
        goto FAILED;
5217
12
        }
5218
10.6k
      terminator = CHAR_NUL;
5219
10.6k
      goto SET_RECURSION;
5220
5221
      /* An item starting (?- followed by a digit comes here via the "default"
5222
      case because (?- followed by a non-digit is an options setting. */
5223
5224
42.6k
      case CHAR_PLUS:
5225
42.6k
      if (ptr + 1 >= ptrend)
5226
3
        {
5227
3
        ++ptr;
5228
3
        goto UNCLOSED_PARENTHESIS;
5229
3
        }
5230
42.6k
      if (!IS_DIGIT(ptr[1]))
5231
8
        {
5232
8
        errorcode = ERR29;   /* Missing number */
5233
8
        ++ptr;
5234
8
        goto FAILED_FORWARD;
5235
8
        }
5236
42.6k
      PCRE2_FALLTHROUGH /* Fall through */
5237
42.6k
5238
64.0k
      case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5239
65.5k
      case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5240
65.8k
      RECURSION_BYNUMBER:
5241
65.8k
      if (!read_number(&ptr, ptrend,
5242
65.8k
          (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
5243
65.8k
          MAX_GROUP_NUMBER, ERR61,
5244
65.8k
          &i, &errorcode)) goto FAILED;
5245
65.8k
      PCRE2_ASSERT(i >= 0);  /* NB (?0) is permitted, represented by i=0 */
5246
65.8k
      terminator = CHAR_NUL;
5247
5248
76.8k
      SET_RECURSION:
5249
76.8k
      *parsed_pattern++ = META_RECURSE | (uint32_t)i;
5250
76.8k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5251
      /* End of recursive call by number handling */
5252
76.8k
      goto READ_RECURSION_ARGUMENTS;
5253
5254
5255
      /* ---- Recursion/subroutine calls by name ---- */
5256
5257
85
      case CHAR_AMPERSAND:
5258
199
      RECURSE_BY_NAME:
5259
199
      if (!read_name(&ptr, ptrend, utf, 0, &offset, &name,
5260
199
          &namelen, &errorcode, cb)) goto FAILED;
5261
188
      *parsed_pattern++ = META_RECURSE_BYNAME;
5262
188
      *parsed_pattern++ = namelen;
5263
188
      terminator = CHAR_NUL;
5264
5265
77.0k
      READ_RECURSION_ARGUMENTS:
5266
77.0k
      PUTOFFSET(offset, parsed_pattern);
5267
77.0k
      okquantifier = TRUE;
5268
5269
      /* Arguments are not supported for the \g construct. */
5270
77.0k
      if (terminator != CHAR_NUL) break;
5271
5272
76.6k
      if (ptr < ptrend && *ptr == CHAR_LEFT_PARENTHESIS)
5273
4.98k
        {
5274
4.98k
        parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
5275
4.98k
                                            offset, &errorcode, cb);
5276
4.98k
        if (parsed_pattern == NULL) goto FAILED;
5277
4.98k
        }
5278
5279
76.5k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5280
70
        goto UNCLOSED_PARENTHESIS;
5281
5282
76.4k
      ptr++;
5283
76.4k
      break;
5284
5285
      /* ---- Callout with numerical or string argument ---- */
5286
5287
7.67k
      case CHAR_C:
5288
7.67k
      if ((xoptions & PCRE2_EXTRA_NEVER_CALLOUT) != 0)
5289
0
        {
5290
0
        ptr++;
5291
0
        errorcode = ERR103;
5292
0
        goto FAILED;
5293
0
        }
5294
5295
7.67k
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5296
5297
      /* If the previous item was a condition starting (?(? an assertion,
5298
      optionally preceded by a callout, is expected. This is checked later on,
5299
      during actual compilation. However we need to identify this kind of
5300
      assertion in this pass because it must not be qualified. The value of
5301
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5302
      for a callout - still leaving a positive value that identifies the
5303
      assertion. Multiple callouts or any other items will make it zero or
5304
      less, which doesn't matter because they will cause an error later. */
5305
5306
7.67k
      expect_cond_assert = prev_expect_cond_assert - 1;
5307
5308
      /* If previous_callout is not NULL, it means this follows a previous
5309
      callout. If it was a manual callout, do nothing; this means its "length
5310
      of next pattern item" field will remain zero. If it was an automatic
5311
      callout, abolish it. */
5312
5313
7.67k
      if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
5314
2.10k
          previous_callout == parsed_pattern - 4 &&
5315
1.36k
          parsed_pattern[-1] == 255)
5316
1.27k
        parsed_pattern = previous_callout;
5317
5318
      /* Save for updating next pattern item length, and skip one item before
5319
      completing. */
5320
5321
7.67k
      previous_callout = parsed_pattern;
5322
7.67k
      after_manual_callout = 1;
5323
5324
      /* Handle a string argument; specific delimiter is required. */
5325
5326
7.67k
      if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
5327
5.29k
        {
5328
5.29k
        PCRE2_SIZE calloutlength;
5329
5.29k
        PCRE2_SPTR startptr = ptr;
5330
5331
5.29k
        delimiter = 0;
5332
33.8k
        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
5333
33.8k
          {
5334
33.8k
          if (*ptr == PRIV(callout_start_delims)[i])
5335
5.27k
            {
5336
5.27k
            delimiter = PRIV(callout_end_delims)[i];
5337
5.27k
            break;
5338
5.27k
            }
5339
33.8k
          }
5340
5.29k
        if (delimiter == 0)
5341
25
          {
5342
25
          errorcode = ERR82;
5343
25
          goto FAILED_FORWARD;
5344
25
          }
5345
5346
5.27k
        *parsed_pattern = META_CALLOUT_STRING;
5347
5.27k
        parsed_pattern += 3;   /* Skip pattern info */
5348
5349
5.27k
        for (;;)
5350
25.9k
          {
5351
25.9k
          if (++ptr >= ptrend)
5352
35
            {
5353
35
            errorcode = ERR81;
5354
35
            ptr = startptr;   /* To give a more useful message */
5355
35
            goto FAILED;
5356
35
            }
5357
25.8k
          if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
5358
5.23k
            break;
5359
25.8k
          }
5360
5361
5.23k
        calloutlength = (PCRE2_SIZE)(ptr - startptr);
5362
5.23k
        if (calloutlength > UINT32_MAX)
5363
0
          {
5364
0
          errorcode = ERR72;
5365
0
          goto FAILED;
5366
0
          }
5367
5.23k
        *parsed_pattern++ = (uint32_t)calloutlength;
5368
5.23k
        offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
5369
5.23k
        PUTOFFSET(offset, parsed_pattern);
5370
5.23k
        }
5371
5372
      /* Handle a callout with an optional numerical argument, which must be
5373
      less than or equal to 255. A missing argument gives 0. */
5374
5375
2.37k
      else
5376
2.37k
        {
5377
2.37k
        int n = 0;
5378
2.37k
        *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
5379
2.37k
        parsed_pattern += 3;                       /* Skip pattern info */
5380
2.72k
        while (ptr < ptrend && IS_DIGIT(*ptr))
5381
358
          {
5382
358
          n = n * 10 + (*ptr++ - CHAR_0);
5383
358
          if (n > 255)
5384
7
            {
5385
7
            errorcode = ERR38;
5386
7
            goto FAILED;
5387
7
            }
5388
358
          }
5389
2.36k
        *parsed_pattern++ = n;
5390
2.36k
        }
5391
5392
      /* Both formats must have a closing parenthesis */
5393
5394
7.60k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5395
35
        {
5396
35
        errorcode = ERR39;
5397
35
        goto FAILED;
5398
35
        }
5399
7.56k
      ptr++;
5400
5401
      /* Remember the offset to the next item in the pattern, and set a default
5402
      length. This should get updated after the next item is read. */
5403
5404
7.56k
      previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
5405
7.56k
      previous_callout[2] = 0;
5406
7.56k
      break;                  /* End callout */
5407
5408
5409
      /* ---- Conditional group ---- */
5410
5411
      /* A condition can be an assertion, a number (referring to a numbered
5412
      group's having been set), a name (referring to a named group), or 'R',
5413
      referring to overall recursion. R<digits> and R&name are also permitted
5414
      for recursion state tests. Numbers may be preceded by + or - to specify a
5415
      relative group number.
5416
5417
      There are several syntaxes for testing a named group: (?(name)) is used
5418
      by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5419
5420
      There are two unfortunate ambiguities. 'R' can be the recursive thing or
5421
      the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
5422
      the Perl DEFINE feature or the Python named test. We look for a name
5423
      first; if not found, we try the other case.
5424
5425
      For compatibility with auto-callouts, we allow a callout to be specified
5426
      before a condition that is an assertion. */
5427
5428
22.3k
      case CHAR_LEFT_PARENTHESIS:
5429
22.3k
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5430
22.3k
      nest_depth++;
5431
5432
      /* If the next character is ? or * there must be an assertion next
5433
      (optionally preceded by a callout). We do not check this here, but
5434
      instead we set expect_cond_assert to 2. If this is still greater than
5435
      zero (callouts decrement it) when the next assertion is read, it will be
5436
      marked as a condition that must not be repeated. A value greater than
5437
      zero also causes checking that an assertion (possibly with callout)
5438
      follows. */
5439
5440
22.3k
      if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
5441
11.3k
        {
5442
11.3k
        *parsed_pattern++ = META_COND_ASSERT;
5443
11.3k
        ptr--;   /* Pull pointer back to the opening parenthesis. */
5444
11.3k
        expect_cond_assert = 2;
5445
11.3k
        break;  /* End of conditional */
5446
11.3k
        }
5447
5448
      /* Handle (?([+-]number)... */
5449
5450
10.9k
      if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
5451
10.9k
          &errorcode))
5452
584
        {
5453
584
        PCRE2_ASSERT(i >= 0);
5454
584
        if (i <= 0)
5455
3
          {
5456
3
          errorcode = ERR15;
5457
3
          goto FAILED;
5458
3
          }
5459
581
        *parsed_pattern++ = META_COND_NUMBER;
5460
581
        offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5461
581
        PUTOFFSET(offset, parsed_pattern);
5462
581
        *parsed_pattern++ = i;
5463
581
        }
5464
10.3k
      else if (errorcode != 0) goto FAILED;   /* Number too big */
5465
5466
      /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
5467
5468
10.3k
      else if (ptrend - ptr >= 10 &&
5469
10.1k
               PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
5470
1.52k
               ptr[7] != CHAR_RIGHT_PARENTHESIS)
5471
1.32k
        {
5472
1.32k
        uint32_t ge = 0;
5473
1.32k
        int major = 0;
5474
1.32k
        int minor = 0;
5475
5476
1.32k
        ptr += 7;
5477
1.32k
        if (*ptr == CHAR_GREATER_THAN_SIGN)
5478
533
          {
5479
533
          ge = 1;
5480
533
          ptr++;
5481
533
          }
5482
5483
        /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
5484
        references its argument twice. */
5485
5486
1.32k
        if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
5487
12
          {
5488
12
          errorcode = ERR79;
5489
12
          if (!ge) goto FAILED_FORWARD;
5490
5
          goto FAILED;
5491
12
          }
5492
5493
1.30k
        if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
5494
3
          goto FAILED;
5495
5496
1.30k
        if (ptr < ptrend && *ptr == CHAR_DOT)
5497
94
          {
5498
94
          if (++ptr >= ptrend || !IS_DIGIT(*ptr))
5499
9
            {
5500
9
            errorcode = ERR79;
5501
9
            if (ptr < ptrend) goto FAILED_FORWARD;
5502
3
            goto FAILED;
5503
9
            }
5504
85
          if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &minor, &errorcode))
5505
3
            goto FAILED;
5506
85
          }
5507
1.29k
        if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5508
9
          {
5509
9
          errorcode = ERR79;
5510
9
          if (ptr < ptrend) goto FAILED_FORWARD;
5511
6
          goto FAILED;
5512
9
          }
5513
5514
1.28k
        *parsed_pattern++ = META_COND_VERSION;
5515
1.28k
        *parsed_pattern++ = ge;
5516
1.28k
        *parsed_pattern++ = major;
5517
1.28k
        *parsed_pattern++ = minor;
5518
1.28k
        }
5519
5520
      /* All the remaining cases now require us to read a name. We cannot at
5521
      this stage distinguish ambiguous cases such as (?(R12) which might be a
5522
      recursion test by number or a name, because the named groups have not yet
5523
      all been identified. Those cases are treated as names, but given a
5524
      different META code. */
5525
5526
9.07k
      else
5527
9.07k
        {
5528
9.07k
        BOOL was_r_ampersand = FALSE;
5529
5530
9.07k
        if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
5531
2.36k
          {
5532
2.36k
          terminator = CHAR_RIGHT_PARENTHESIS;
5533
2.36k
          was_r_ampersand = TRUE;
5534
2.36k
          ptr++;
5535
2.36k
          }
5536
6.70k
        else if (*ptr == CHAR_LESS_THAN_SIGN)
5537
205
          terminator = CHAR_GREATER_THAN_SIGN;
5538
6.49k
        else if (*ptr == CHAR_APOSTROPHE)
5539
208
          terminator = CHAR_APOSTROPHE;
5540
6.29k
        else
5541
6.29k
          {
5542
6.29k
          terminator = CHAR_RIGHT_PARENTHESIS;
5543
6.29k
          ptr--;   /* Point to char before name */
5544
6.29k
          }
5545
5546
9.07k
        if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5547
9.07k
            &errorcode, cb)) goto FAILED;
5548
5549
        /* Handle (?(R&name) */
5550
5551
8.97k
        if (was_r_ampersand)
5552
2.35k
          {
5553
2.35k
          *parsed_pattern = META_COND_RNAME;
5554
2.35k
          ptr--;   /* Back to closing parens */
5555
2.35k
          }
5556
5557
        /* Handle (?(name). If the name is "DEFINE" we identify it with a
5558
        special code. Likewise if the name consists of R followed only by
5559
        digits. Otherwise, handle it like a quoted name. */
5560
5561
6.61k
        else if (terminator == CHAR_RIGHT_PARENTHESIS)
5562
6.21k
          {
5563
6.21k
          if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
5564
435
            *parsed_pattern = META_COND_DEFINE;
5565
5.77k
          else
5566
5.77k
            {
5567
9.61k
            for (i = 1; i < (int)namelen; i++)
5568
4.17k
              if (!IS_DIGIT(name[i])) break;
5569
5.77k
            *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
5570
4.32k
              META_COND_RNUMBER : META_COND_NAME;
5571
5.77k
            }
5572
6.21k
          ptr--;   /* Back to closing parens */
5573
6.21k
          }
5574
5575
        /* Handle (?('name') or (?(<name>) */
5576
5577
403
        else *parsed_pattern = META_COND_NAME;
5578
5579
        /* All these cases except DEFINE end with the name length and offset;
5580
        DEFINE just has an offset (for the "too many branches" error). */
5581
5582
8.97k
        if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
5583
8.97k
        PUTOFFSET(offset, parsed_pattern);
5584
8.97k
        }  /* End cases that read a name */
5585
5586
      /* Check the closing parenthesis of the condition */
5587
5588
10.8k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5589
15
        {
5590
15
        errorcode = ERR24;
5591
15
        goto FAILED;
5592
15
        }
5593
10.8k
      ptr++;
5594
10.8k
      break;  /* End of condition processing */
5595
5596
5597
      /* ---- Atomic group ---- */
5598
5599
3.03k
      case CHAR_GREATER_THAN_SIGN:
5600
3.03k
      ATOMIC_GROUP:                          /* Come from (*atomic: */
5601
3.03k
      *parsed_pattern++ = META_ATOMIC;
5602
3.03k
      nest_depth++;
5603
3.03k
      ptr++;
5604
3.03k
      break;
5605
5606
5607
      /* ---- Lookahead assertions ---- */
5608
5609
16.3k
      case CHAR_EQUALS_SIGN:
5610
16.4k
      POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
5611
16.4k
      *parsed_pattern++ = META_LOOKAHEAD;
5612
16.4k
      ptr++;
5613
16.4k
      goto POST_ASSERTION;
5614
5615
7.69k
      case CHAR_ASTERISK:
5616
7.76k
      POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (*napla: */
5617
7.76k
      *parsed_pattern++ = META_LOOKAHEAD_NA;
5618
7.76k
      ptr++;
5619
7.76k
      goto POST_ASSERTION;
5620
5621
10.3k
      case CHAR_EXCLAMATION_MARK:
5622
10.5k
      NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
5623
10.5k
      *parsed_pattern++ = META_LOOKAHEADNOT;
5624
10.5k
      ptr++;
5625
10.5k
      goto POST_ASSERTION;
5626
5627
5628
      /* ---- Lookbehind assertions ---- */
5629
5630
      /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
5631
      is the start of the name of a capturing group. */
5632
5633
18.2k
      case CHAR_LESS_THAN_SIGN:
5634
18.2k
      if (ptrend - ptr <= 1 ||
5635
18.2k
         (ptr[1] != CHAR_EQUALS_SIGN &&
5636
13.6k
          ptr[1] != CHAR_EXCLAMATION_MARK &&
5637
6.70k
          ptr[1] != CHAR_ASTERISK))
5638
4.77k
        {
5639
4.77k
        terminator = CHAR_GREATER_THAN_SIGN;
5640
4.77k
        goto DEFINE_NAME;
5641
4.77k
        }
5642
13.4k
      *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
5643
8.87k
        META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
5644
6.93k
        META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
5645
5646
13.8k
      POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
5647
13.8k
      *has_lookbehind = TRUE;
5648
13.8k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5649
13.8k
      PUTOFFSET(offset, parsed_pattern);
5650
13.8k
      ptr += 2;
5651
      /* Fall through */
5652
5653
      /* If the previous item was a condition starting (?(? an assertion,
5654
      optionally preceded by a callout, is expected. This is checked later on,
5655
      during actual compilation. However we need to identify this kind of
5656
      assertion in this pass because it must not be qualified. The value of
5657
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5658
      for a callout - still leaving a positive value that identifies the
5659
      assertion. Multiple callouts or any other items will make it zero or
5660
      less, which doesn't matter because they will cause an error later. */
5661
5662
50.4k
      POST_ASSERTION:
5663
50.4k
      nest_depth++;
5664
50.4k
      if (prev_expect_cond_assert > 0)
5665
11.2k
        {
5666
11.2k
        if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
5667
1.25k
        else if (++top_nest >= end_nests)
5668
0
          {
5669
0
          errorcode = ERR84;
5670
0
          goto FAILED;
5671
0
          }
5672
11.2k
        top_nest->nest_depth = nest_depth;
5673
11.2k
        top_nest->flags = NSF_CONDASSERT;
5674
11.2k
        top_nest->options = options & PARSE_TRACKED_OPTIONS;
5675
11.2k
        top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5676
11.2k
        }
5677
50.4k
      break;
5678
5679
5680
      /* ---- Define a named group ---- */
5681
5682
      /* A named group may be defined as (?'name') or (?<name>). In the latter
5683
      case we jump to DEFINE_NAME from the disambiguation of (?< above with the
5684
      terminator set to '>'. */
5685
5686
50.4k
      case CHAR_APOSTROPHE:
5687
10.4k
      terminator = CHAR_APOSTROPHE;    /* Terminator */
5688
5689
15.2k
      DEFINE_NAME:
5690
15.2k
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5691
15.2k
          &errorcode, cb)) goto FAILED;
5692
5693
      /* We have a name for this capturing group. It is also assigned a number,
5694
      which is its primary means of identification. */
5695
5696
15.2k
      if (cb->bracount >= MAX_GROUP_NUMBER)
5697
0
        {
5698
0
        errorcode = ERR97;
5699
0
        goto FAILED;
5700
0
        }
5701
15.2k
      cb->bracount++;
5702
15.2k
      *parsed_pattern++ = META_CAPTURE | cb->bracount;
5703
15.2k
      nest_depth++;
5704
5705
      /* Check not too many names */
5706
5707
15.2k
      if (cb->names_found >= MAX_NAME_COUNT)
5708
0
        {
5709
0
        errorcode = ERR49;
5710
0
        goto FAILED;
5711
0
        }
5712
5713
      /* Adjust the entry size to accommodate the longest name found. */
5714
5715
15.2k
      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
5716
2.21k
        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
5717
5718
      /* Scan the list to check for duplicates. For duplicate names, if the
5719
      number is the same, break the loop, which causes the name to be
5720
      discarded; otherwise, if DUPNAMES is not set, give an error.
5721
      If it is set, allow the name with a different number, but continue
5722
      scanning in case this is a duplicate with the same number. For
5723
      non-duplicate names, give an error if the number is duplicated. */
5724
5725
15.2k
      is_dupname = FALSE;
5726
15.2k
      hash = PRIV(compile_get_hash_from_name)(name, namelen);
5727
15.2k
      ng = cb->named_groups;
5728
49.4k
      for (i = 0; i < cb->names_found; i++, ng++)
5729
45.9k
        {
5730
45.9k
        if (namelen == ng->length && hash == NAMED_GROUP_GET_HASH(ng) &&
5731
12.0k
            PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
5732
11.6k
          {
5733
          /* When a bracket is referenced by the same name multiple
5734
          times, it is not considered a duplicate and is ignored. */
5735
11.6k
          if (ng->number == cb->bracount) break;
5736
11.0k
          if ((options & PCRE2_DUPNAMES) == 0)
5737
392
            {
5738
392
            errorcode = ERR43;
5739
392
            goto FAILED;
5740
392
            }
5741
5742
10.6k
          ng->hash_dup |= NAMED_GROUP_IS_DUPNAME;
5743
10.6k
          is_dupname = TRUE;                /* Mark as a duplicate */
5744
10.6k
          cb->dupnames = TRUE;              /* Duplicate names exist */
5745
5746
          /* The entry represents a duplicate. */
5747
10.6k
          name = ng->name;
5748
10.6k
          namelen = 0;
5749
5750
          /* Even duplicated names may refer to the same
5751
          capture index. These references are also ignored. */
5752
466k
          for (; i < cb->names_found; i++, ng++)
5753
456k
            if (ng->name == name && ng->number == cb->bracount)
5754
200
              break;
5755
10.6k
          break;
5756
11.0k
          }
5757
34.2k
        else if (ng->number == cb->bracount)
5758
3
          {
5759
3
          errorcode = ERR65;
5760
3
          goto FAILED;
5761
3
          }
5762
45.9k
        }
5763
5764
      /* Ignore duplicate with same number. */
5765
14.8k
      if (i < cb->names_found) break;
5766
5767
      /* Increase the list size if necessary */
5768
5769
14.0k
      if (cb->names_found >= cb->named_group_list_size)
5770
238
        {
5771
238
        uint32_t newsize = cb->named_group_list_size * 2;
5772
238
        named_group *newspace =
5773
238
          cb->cx->memctl.malloc(newsize * sizeof(named_group),
5774
238
          cb->cx->memctl.memory_data);
5775
238
        if (newspace == NULL)
5776
0
          {
5777
0
          errorcode = ERR21;
5778
0
          goto FAILED;
5779
0
          }
5780
5781
238
        memcpy(newspace, cb->named_groups,
5782
238
          cb->named_group_list_size * sizeof(named_group));
5783
238
        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5784
109
          cb->cx->memctl.free((void *)cb->named_groups,
5785
109
          cb->cx->memctl.memory_data);
5786
238
        cb->named_groups = newspace;
5787
238
        cb->named_group_list_size = newsize;
5788
238
        }
5789
5790
      /* Add this name to the list */
5791
14.0k
      if (is_dupname)
5792
10.4k
        hash |= NAMED_GROUP_IS_DUPNAME;
5793
5794
14.0k
      cb->named_groups[cb->names_found].name = name;
5795
14.0k
      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
5796
14.0k
      cb->named_groups[cb->names_found].number = cb->bracount;
5797
14.0k
      cb->named_groups[cb->names_found].hash_dup = hash;
5798
14.0k
      cb->names_found++;
5799
14.0k
      break;
5800
5801
5802
      /* ---- Perl extended character class ---- */
5803
5804
      /* These are of the form '(?[...])'. We handle these via the same parser
5805
      that consumes ordinary '[...]' classes, but with a flag set to activate
5806
      the extended behaviour. */
5807
5808
840
      case CHAR_LEFT_SQUARE_BRACKET:
5809
840
      class_mode_state = CLASS_MODE_PERL_EXT;
5810
840
      c = *ptr++;
5811
840
      goto FROM_PERL_EXTENDED_CLASS;
5812
194k
      }        /* End of (? switch */
5813
195k
    break;     /* End of ( handling */
5814
5815
5816
    /* ---- Branch terminators ---- */
5817
5818
    /* Alternation: reset the capture count if we are in a (?| group. */
5819
5820
473k
    case CHAR_VERTICAL_LINE:
5821
473k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
5822
28.4k
        (top_nest->flags & NSF_RESET) != 0)
5823
3.74k
      {
5824
3.74k
      if (cb->bracount > top_nest->max_group)
5825
480
        top_nest->max_group = (uint16_t)cb->bracount;
5826
3.74k
      cb->bracount = top_nest->reset_group;
5827
3.74k
      }
5828
473k
    *parsed_pattern++ = META_ALT;
5829
473k
    break;
5830
5831
    /* End of group; reset the capture count to the maximum if we are in a (?|
5832
    group and/or reset the options that are tracked during parsing. Disallow
5833
    quantifier for a condition that is an assertion. */
5834
5835
932k
    case CHAR_RIGHT_PARENTHESIS:
5836
932k
    okquantifier = TRUE;
5837
932k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
5838
23.7k
      {
5839
23.7k
      options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
5840
23.7k
      xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
5841
23.7k
      if ((top_nest->flags & NSF_RESET) != 0 &&
5842
4.02k
          top_nest->max_group > cb->bracount)
5843
364
        cb->bracount = top_nest->max_group;
5844
23.7k
      if ((top_nest->flags & NSF_CONDASSERT) != 0)
5845
10.7k
        okquantifier = FALSE;
5846
5847
23.7k
      if ((top_nest->flags & NSF_ATOMICSR) != 0)
5848
382
        {
5849
382
        *parsed_pattern++ = META_KET;
5850
5851
#ifdef PCRE2_DEBUG
5852
        PCRE2_ASSERT(parsed_pattern_extra > 0);
5853
        parsed_pattern_extra--;
5854
#endif
5855
382
        }
5856
5857
23.7k
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
5858
6.04k
        else top_nest--;
5859
23.7k
      }
5860
932k
    if (nest_depth == 0)    /* Unmatched closing parenthesis */
5861
414
      {
5862
414
      errorcode = ERR22;
5863
414
      goto FAILED;
5864
414
      }
5865
932k
    nest_depth--;
5866
932k
    *parsed_pattern++ = META_KET;
5867
932k
    break;
5868
8.47M
    }  /* End of switch on pattern character */
5869
8.47M
  }    /* End of main character scan loop */
5870
5871
/* End of pattern reached. Check for missing ) at the end of a verb name. */
5872
5873
79.2k
if (inverbname && ptr >= ptrend)
5874
160
  {
5875
160
  errorcode = ERR60;
5876
160
  goto FAILED;
5877
160
  }
5878
5879
5880
79.0k
PARSED_END:
5881
5882
79.0k
PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
5883
79.0k
             (parsed_pattern_extra - parsed_pattern_extra_check) <=
5884
79.0k
               max_parsed_pattern(ptr_check, ptr, utf, options));
5885
5886
/* Manage callout for the final item */
5887
5888
79.0k
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
5889
79.0k
  parsed_pattern, cb);
5890
5891
/* Insert trailing items for word and line matching (features provided for the
5892
benefit of pcre2grep). */
5893
5894
79.0k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
5895
0
  {
5896
0
  *parsed_pattern++ = META_KET;
5897
0
  *parsed_pattern++ = META_DOLLAR;
5898
0
  }
5899
79.0k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5900
0
  {
5901
0
  *parsed_pattern++ = META_KET;
5902
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
5903
0
  }
5904
5905
/* Terminate the parsed pattern, then return success if all groups are closed.
5906
Otherwise we have unclosed parentheses. */
5907
5908
/* LCOV_EXCL_START */
5909
79.0k
if (parsed_pattern >= parsed_pattern_end)
5910
0
  {
5911
0
  PCRE2_DEBUG_UNREACHABLE();
5912
0
  errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5913
0
  goto FAILED;
5914
0
  }
5915
/* LCOV_EXCL_STOP */
5916
5917
79.0k
*parsed_pattern = META_END;
5918
79.0k
if (nest_depth == 0) return 0;
5919
5920
1.26k
UNCLOSED_PARENTHESIS:
5921
1.26k
errorcode = ERR14;
5922
5923
/* Come here for all failures. */
5924
5925
6.51k
FAILED:
5926
6.51k
cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5927
6.51k
return errorcode;
5928
5929
/* Some errors need to indicate the previous character. */
5930
5931
33
FAILED_BACK:
5932
33
ptr--;
5933
33
#ifdef SUPPORT_UNICODE
5934
33
if (utf) BACKCHAR(ptr);
5935
33
#endif
5936
33
goto FAILED;
5937
5938
/* Some errors need to indicate the next character. */
5939
5940
71
FAILED_FORWARD:
5941
71
ptr++;
5942
71
#ifdef SUPPORT_UNICODE
5943
71
if (utf) FORWARDCHARTEST(ptr, ptrend);
5944
71
#endif
5945
71
goto FAILED;
5946
1.26k
}
5947
5948
5949
5950
/*************************************************
5951
*       Find first significant opcode            *
5952
*************************************************/
5953
5954
/* This is called by several functions that scan a compiled expression looking
5955
for a fixed first character, or an anchoring opcode etc. It skips over things
5956
that do not influence this. For some calls, it makes sense to skip negative
5957
forward and all backward assertions, and also the \b assertion; for others it
5958
does not.
5959
5960
Arguments:
5961
  code         pointer to the start of the group
5962
  skipassert   TRUE if certain assertions are to be skipped
5963
5964
Returns:       pointer to the first significant opcode
5965
*/
5966
5967
static const PCRE2_UCHAR*
5968
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5969
204k
{
5970
204k
for (;;)
5971
541k
  {
5972
541k
  switch ((int)*code)
5973
541k
    {
5974
57.5k
    case OP_ASSERT_NOT:
5975
253k
    case OP_ASSERTBACK:
5976
288k
    case OP_ASSERTBACK_NOT:
5977
297k
    case OP_ASSERTBACK_NA:
5978
297k
    if (!skipassert) return code;
5979
548k
    do code += GET(code, 1); while (*code == OP_ALT);
5980
294k
    code += PRIV(OP_lengths)[*code];
5981
294k
    break;
5982
5983
642
    case OP_WORD_BOUNDARY:
5984
1.33k
    case OP_NOT_WORD_BOUNDARY:
5985
1.66k
    case OP_UCP_WORD_BOUNDARY:
5986
1.83k
    case OP_NOT_UCP_WORD_BOUNDARY:
5987
1.83k
    if (!skipassert) return code;
5988
1.11k
    PCRE2_FALLTHROUGH /* Fall through */
5989
1.11k
5990
39.6k
    case OP_CALLOUT:
5991
39.7k
    case OP_CREF:
5992
39.7k
    case OP_DNCREF:
5993
39.7k
    case OP_RREF:
5994
39.7k
    case OP_DNRREF:
5995
39.7k
    case OP_FALSE:
5996
39.8k
    case OP_TRUE:
5997
39.8k
    code += PRIV(OP_lengths)[*code];
5998
39.8k
    break;
5999
6000
371
    case OP_CALLOUT_STR:
6001
371
    code += GET(code, 1 + 2*LINK_SIZE);
6002
371
    break;
6003
6004
313
    case OP_SKIPZERO:
6005
313
    code += 2 + GET(code, 2) + LINK_SIZE;
6006
313
    break;
6007
6008
1.63k
    case OP_COND:
6009
1.73k
    case OP_SCOND:
6010
1.73k
    if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
6011
100
        code[GET(code, 1)] != OP_KET)      /* More than one branch */
6012
1.67k
      return code;
6013
63
    code += GET(code, 1) + 1 + LINK_SIZE;
6014
63
    break;
6015
6016
507
    case OP_MARK:
6017
588
    case OP_COMMIT_ARG:
6018
912
    case OP_PRUNE_ARG:
6019
1.30k
    case OP_SKIP_ARG:
6020
1.43k
    case OP_THEN_ARG:
6021
1.43k
    code += code[1] + PRIV(OP_lengths)[*code];
6022
1.43k
    break;
6023
6024
198k
    default:
6025
198k
    return code;
6026
541k
    }
6027
541k
  }
6028
6029
/* LCOV_EXCL_START */
6030
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
6031
/* LCOV_EXCL_STOP */
6032
0
}
6033
6034
6035
6036
/*************************************************
6037
*           Compile one branch                   *
6038
*************************************************/
6039
6040
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
6041
the options are changed during the branch, the pointer is used to change the
6042
external options bits. This function is used during the pre-compile phase when
6043
we are trying to find out the amount of memory needed, as well as during the
6044
real compile phase. The value of lengthptr distinguishes the two phases.
6045
6046
Arguments:
6047
  optionsptr        pointer to the option bits
6048
  xoptionsptr       pointer to the extra option bits
6049
  codeptr           points to the pointer to the current code point
6050
  pptrptr           points to the current parsed pattern pointer
6051
  errorcodeptr      points to error code variable
6052
  firstcuptr        place to put the first required code unit
6053
  firstcuflagsptr   place to put the first code unit flags
6054
  reqcuptr          place to put the last required code unit
6055
  reqcuflagsptr     place to put the last required code unit flags
6056
  bcptr             points to current branch chain
6057
  open_caps         points to current capitem
6058
  cb                contains pointers to tables etc.
6059
  lengthptr         NULL during the real compile phase
6060
                    points to length accumulator during pre-compile phase
6061
6062
Returns:            0 There's been an error, *errorcodeptr is non-zero
6063
                   +1 Success, this branch must match at least one character
6064
                   -1 Success, this branch may match an empty string
6065
*/
6066
6067
static int
6068
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
6069
  PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
6070
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
6071
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
6072
  compile_block *cb, PCRE2_SIZE *lengthptr)
6073
2.14M
{
6074
2.14M
int bravalue = 0;
6075
2.14M
int okreturn = -1;
6076
2.14M
int group_return = 0;
6077
2.14M
uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
6078
2.14M
uint32_t greedy_default, greedy_non_default;
6079
2.14M
uint32_t repeat_type, op_type;
6080
2.14M
uint32_t options = *optionsptr;               /* May change dynamically */
6081
2.14M
uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
6082
2.14M
uint32_t firstcu, reqcu;
6083
2.14M
uint32_t zeroreqcu, zerofirstcu;
6084
2.14M
uint32_t *pptr = *pptrptr;
6085
2.14M
uint32_t meta, meta_arg;
6086
2.14M
uint32_t firstcuflags, reqcuflags;
6087
2.14M
uint32_t zeroreqcuflags, zerofirstcuflags;
6088
2.14M
uint32_t req_caseopt, reqvary, tempreqvary;
6089
/* Some opcodes, such as META_CAPTURE_NUMBER or META_CAPTURE_NAME,
6090
depend on the previous value of offset. */
6091
2.14M
PCRE2_SIZE offset = 0;
6092
2.14M
PCRE2_SIZE length_prevgroup = 0;
6093
2.14M
PCRE2_UCHAR *code = *codeptr;
6094
2.14M
PCRE2_UCHAR *last_code = code;
6095
2.14M
PCRE2_UCHAR *orig_code = code;
6096
2.14M
PCRE2_UCHAR *tempcode;
6097
2.14M
PCRE2_UCHAR *previous = NULL;
6098
2.14M
PCRE2_UCHAR op_previous;
6099
2.14M
BOOL groupsetfirstcu = FALSE;
6100
2.14M
BOOL had_accept = FALSE;
6101
2.14M
BOOL matched_char = FALSE;
6102
2.14M
BOOL previous_matched_char = FALSE;
6103
2.14M
BOOL reset_caseful = FALSE;
6104
6105
/* We can fish out the UTF setting once and for all into a BOOL, but we must
6106
not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
6107
as we process the pattern. */
6108
6109
2.14M
#ifdef SUPPORT_UNICODE
6110
2.14M
BOOL utf = (options & PCRE2_UTF) != 0;
6111
2.14M
BOOL ucp = (options & PCRE2_UCP) != 0;
6112
#else  /* No Unicode support */
6113
BOOL utf = FALSE;
6114
#endif
6115
6116
/* Set up the default and non-default settings for greediness */
6117
6118
2.14M
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6119
2.14M
greedy_non_default = greedy_default ^ 1;
6120
6121
/* Initialize no first unit, no required unit. REQ_UNSET means "no char
6122
matching encountered yet". It gets changed to REQ_NONE if we hit something that
6123
matches a non-fixed first unit; reqcu just remains unset if we never find one.
6124
6125
When we hit a repeat whose minimum is zero, we may have to adjust these values
6126
to take the zero repeat into account. This is implemented by setting them to
6127
zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
6128
item types that can be repeated set these backoff variables appropriately. */
6129
6130
2.14M
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
6131
2.14M
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
6132
6133
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
6134
according to the current setting of the caseless flag. The REQ_CASELESS value
6135
leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
6136
to record the case status of the value. This is used only for ASCII characters.
6137
*/
6138
6139
2.14M
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6140
6141
/* Switch on next META item until the end of the branch */
6142
6143
12.8M
for (;; pptr++)
6144
15.0M
  {
6145
15.0M
  BOOL possessive_quantifier;
6146
15.0M
  BOOL note_group_empty;
6147
15.0M
  uint32_t mclength;
6148
15.0M
  uint32_t skipunits;
6149
15.0M
  uint32_t subreqcu, subfirstcu;
6150
15.0M
  uint32_t groupnumber;
6151
15.0M
  uint32_t verbarglen, verbculen;
6152
15.0M
  uint32_t subreqcuflags, subfirstcuflags;
6153
15.0M
  open_capitem *oc;
6154
15.0M
  PCRE2_UCHAR mcbuffer[8];
6155
6156
  /* Get next META item in the pattern and its potential argument. */
6157
6158
15.0M
  meta = META_CODE(*pptr);
6159
15.0M
  meta_arg = META_DATA(*pptr);
6160
6161
  /* If we are in the pre-compile phase, accumulate the length used for the
6162
  previous cycle of this loop, unless the next item is a quantifier. */
6163
6164
15.0M
  if (lengthptr != NULL)
6165
7.75M
    {
6166
    /* LCOV_EXCL_START */
6167
7.75M
    if (code >= cb->start_workspace + cb->workspace_size)
6168
0
      {
6169
0
      PCRE2_DEBUG_UNREACHABLE();
6170
0
      *errorcodeptr = ERR52;  /* Over-ran workspace - internal error */
6171
0
      cb->erroroffset = 0;
6172
0
      return 0;
6173
0
      }
6174
    /* LCOV_EXCL_STOP */
6175
6176
7.75M
    if (code > cb->start_workspace + cb->workspace_size -
6177
7.75M
        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
6178
0
      {
6179
0
      *errorcodeptr = ERR86;  /* Pattern too complicated */
6180
0
      cb->erroroffset = 0;
6181
0
      return 0;
6182
0
      }
6183
6184
    /* There is at least one situation where code goes backwards: this is the
6185
    case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
6186
    is processed, the whole class is eliminated. However, it is created first,
6187
    so we have to allow memory for it. Therefore, don't ever reduce the length
6188
    at this point. */
6189
6190
7.75M
    if (code < last_code) code = last_code;
6191
6192
    /* If the next thing is not a quantifier, we add the length of the previous
6193
    item into the total, and reset the code pointer to the start of the
6194
    workspace. Otherwise leave the previous item available to be quantified. */
6195
6196
7.75M
    if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6197
7.29M
      {
6198
7.29M
      if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
6199
1
        {
6200
1
        *errorcodeptr = ERR20;   /* Integer overflow */
6201
1
        cb->erroroffset = 0;
6202
1
        return 0;
6203
1
        }
6204
7.29M
      *lengthptr += (PCRE2_SIZE)(code - orig_code);
6205
7.29M
      if (*lengthptr > MAX_PATTERN_SIZE)
6206
59
        {
6207
59
        *errorcodeptr = ERR20;   /* Pattern is too large */
6208
59
        cb->erroroffset = 0;
6209
59
        return 0;
6210
59
        }
6211
7.29M
      code = orig_code;
6212
7.29M
      }
6213
6214
    /* Remember where this code item starts so we can catch the "backwards"
6215
    case above next time round. */
6216
6217
7.75M
    last_code = code;
6218
7.75M
    }
6219
6220
  /* Process the next parsed pattern item. If it is not a quantifier, remember
6221
  where it starts so that it can be quantified when a quantifier follows.
6222
  Checking for the legality of quantifiers happens in parse_regex(), except for
6223
  a quantifier after an assertion that is a condition. */
6224
6225
15.0M
  if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6226
14.1M
    {
6227
14.1M
    previous = code;
6228
14.1M
    if (matched_char && !had_accept) okreturn = 1;
6229
14.1M
    }
6230
6231
15.0M
  previous_matched_char = matched_char;
6232
15.0M
  matched_char = FALSE;
6233
15.0M
  note_group_empty = FALSE;
6234
15.0M
  skipunits = 0;         /* Default value for most subgroups */
6235
6236
15.0M
  switch(meta)
6237
15.0M
    {
6238
    /* ===================================================================*/
6239
    /* The branch terminates at pattern end or | or ) */
6240
6241
151k
    case META_END:
6242
1.02M
    case META_ALT:
6243
2.13M
    case META_KET:
6244
2.13M
    *firstcuptr = firstcu;
6245
2.13M
    *firstcuflagsptr = firstcuflags;
6246
2.13M
    *reqcuptr = reqcu;
6247
2.13M
    *reqcuflagsptr = reqcuflags;
6248
2.13M
    *codeptr = code;
6249
2.13M
    *pptrptr = pptr;
6250
2.13M
    return okreturn;
6251
6252
6253
    /* ===================================================================*/
6254
    /* Handle single-character metacharacters. In multiline mode, ^ disables
6255
    the setting of any following char as a first character. */
6256
6257
97.1k
    case META_CIRCUMFLEX:
6258
97.1k
    if ((options & PCRE2_MULTILINE) != 0)
6259
23.8k
      {
6260
23.8k
      if (firstcuflags == REQ_UNSET)
6261
2.48k
        zerofirstcuflags = firstcuflags = REQ_NONE;
6262
23.8k
      *code++ = OP_CIRCM;
6263
23.8k
      }
6264
73.2k
    else *code++ = OP_CIRC;
6265
97.1k
    break;
6266
6267
108k
    case META_DOLLAR:
6268
108k
    *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
6269
108k
    break;
6270
6271
    /* There can never be a first char if '.' is first, whatever happens about
6272
    repeats. The value of reqcu doesn't change either. */
6273
6274
172k
    case META_DOT:
6275
172k
    matched_char = TRUE;
6276
172k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6277
172k
    zerofirstcu = firstcu;
6278
172k
    zerofirstcuflags = firstcuflags;
6279
172k
    zeroreqcu = reqcu;
6280
172k
    zeroreqcuflags = reqcuflags;
6281
172k
    *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
6282
172k
    break;
6283
6284
6285
    /* ===================================================================*/
6286
    /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
6287
    Otherwise, an initial ']' is taken as a data character. When empty classes
6288
    are allowed, [] must generate an empty class - we have no dedicated opcode
6289
    to optimise the representation, but it's a rare case (the '(*FAIL)'
6290
    construct would be a clearer way for a pattern author to represent a
6291
    non-matching branch, but it does have different semantics to '[]' if both
6292
    are followed by a quantifier). The empty-negated [^] matches any character,
6293
    so is useful: generate OP_ALLANY for this. */
6294
6295
3.70k
    case META_CLASS_EMPTY:
6296
5.58k
    case META_CLASS_EMPTY_NOT:
6297
5.58k
    matched_char = TRUE;
6298
5.58k
    if (meta == META_CLASS_EMPTY_NOT) *code++ = OP_ALLANY;
6299
3.70k
    else
6300
3.70k
      {
6301
3.70k
      *code++ = OP_CLASS;
6302
3.70k
      memset(code, 0, 32);
6303
3.70k
      code += 32 / sizeof(PCRE2_UCHAR);
6304
3.70k
      }
6305
6306
5.58k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6307
5.58k
    zerofirstcu = firstcu;
6308
5.58k
    zerofirstcuflags = firstcuflags;
6309
5.58k
    break;
6310
6311
6312
    /* ===================================================================*/
6313
    /* Non-empty character class. If the included characters are all < 256, we
6314
    build a 32-byte bitmap of the permitted characters, except in the special
6315
    case where there is only one such character. For negated classes, we build
6316
    the map as usual, then invert it at the end. However, we use a different
6317
    opcode so that data characters > 255 can be handled correctly.
6318
6319
    If the class contains characters outside the 0-255 range, a different
6320
    opcode is compiled. It may optionally have a bit map for characters < 256,
6321
    but those above are explicitly listed afterwards. A flag code unit tells
6322
    whether the bitmap is present, and whether this is a negated class or
6323
    not. */
6324
6325
103k
    case META_CLASS_NOT:
6326
274k
    case META_CLASS:
6327
274k
    matched_char = TRUE;
6328
6329
    /* Check for complex extended classes and handle them separately. */
6330
6331
274k
    if ((*pptr & CLASS_IS_ECLASS) != 0)
6332
16.9k
      {
6333
16.9k
      if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,
6334
16.9k
                                      errorcodeptr, cb, lengthptr))
6335
0
        return 0;
6336
16.9k
      goto CLASS_END_PROCESSING;
6337
16.9k
      }
6338
6339
    /* We can optimize the case of a single character in a class by generating
6340
    OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
6341
    negative. In the negative case there can be no first char if this item is
6342
    first, whatever repeat count may follow. In the case of reqcu, save the
6343
    previous value for reinstating. */
6344
6345
    /* NOTE: at present this optimization is not effective if the only
6346
    character in a class in 32-bit, non-UCP mode has its top bit set. */
6347
6348
257k
    if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
6349
60.6k
      {
6350
60.6k
      uint32_t c = pptr[1];
6351
6352
60.6k
      pptr += 2;                 /* Move on to class end */
6353
60.6k
      if (meta == META_CLASS)    /* A positive one-char class can be */
6354
9.57k
        {                        /* handled as a normal literal character. */
6355
9.57k
        meta = c;                /* Set up the character */
6356
9.57k
        goto NORMAL_CHAR_SET;
6357
9.57k
        }
6358
6359
      /* Handle a negative one-character class */
6360
6361
51.1k
      zeroreqcu = reqcu;
6362
51.1k
      zeroreqcuflags = reqcuflags;
6363
51.1k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6364
51.1k
      zerofirstcu = firstcu;
6365
51.1k
      zerofirstcuflags = firstcuflags;
6366
6367
      /* For caseless UTF or UCP mode, check whether this character has more
6368
      than one other case. If so, generate a special OP_NOTPROP item instead of
6369
      OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
6370
      caseless set that starts with an ASCII character. If the character is
6371
      affected by the special Turkish rules, hardcode the not-matching
6372
      characters using a caseset. */
6373
6374
51.1k
#ifdef SUPPORT_UNICODE
6375
51.1k
      if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
6376
13.6k
        {
6377
13.6k
        uint32_t caseset;
6378
6379
13.6k
        if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6380
13.6k
              PCRE2_EXTRA_TURKISH_CASING &&
6381
0
            UCD_ANY_I(c))
6382
0
          {
6383
0
          caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
6384
0
          }
6385
13.6k
        else if ((caseset = UCD_CASESET(c)) != 0 &&
6386
985
                 (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6387
266
                 PRIV(ucd_caseless_sets)[caseset] < 128)
6388
200
          {
6389
200
          caseset = 0;  /* Ignore the caseless set if it's restricted. */
6390
200
          }
6391
6392
13.6k
        if (caseset != 0)
6393
785
          {
6394
785
          *code++ = OP_NOTPROP;
6395
785
          *code++ = PT_CLIST;
6396
785
          *code++ = caseset;
6397
785
          break;   /* We are finished with this class */
6398
785
          }
6399
13.6k
        }
6400
50.3k
#endif
6401
      /* Char has only one other (usable) case, or UCP not available */
6402
6403
50.3k
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
6404
50.3k
      code += PUTCHAR(c, code);
6405
50.3k
      break;   /* We are finished with this class */
6406
51.1k
      }        /* End of 1-char optimization */
6407
6408
    /* Handle character classes that contain more than just one literal
6409
    character. If there are exactly two characters in a positive class, see if
6410
    they are case partners. This can be optimized to generate a caseless single
6411
    character match (which also sets first/required code units if relevant).
6412
    When casing restrictions apply, ignore a caseless set if both characters
6413
    are ASCII. When Turkish casing applies, an 'i' does not match its normal
6414
    Unicode "othercase". */
6415
6416
196k
    if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
6417
119k
        pptr[3] == META_CLASS_END)
6418
31.0k
      {
6419
31.0k
      uint32_t c = pptr[1];
6420
6421
31.0k
#ifdef SUPPORT_UNICODE
6422
31.0k
      if ((UCD_CASESET(c) == 0 ||
6423
2.15k
           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6424
586
            c < 128 && pptr[2] < 128)) &&
6425
29.0k
          !((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6426
29.0k
              PCRE2_EXTRA_TURKISH_CASING &&
6427
0
            UCD_ANY_I(c)))
6428
29.0k
#endif
6429
29.0k
        {
6430
29.0k
        uint32_t d;
6431
6432
29.0k
#ifdef SUPPORT_UNICODE
6433
29.0k
        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
6434
28.2k
#endif
6435
28.2k
          {
6436
#if PCRE2_CODE_UNIT_WIDTH != 8
6437
          if (c > 255) d = c; else
6438
#endif
6439
28.2k
          d = TABLE_GET(c, cb->fcc, c);
6440
28.2k
          }
6441
6442
29.0k
        if (c != d && pptr[2] == d)
6443
700
          {
6444
700
          pptr += 3;                 /* Move on to class end */
6445
700
          meta = c;
6446
700
          if ((options & PCRE2_CASELESS) == 0)
6447
480
            {
6448
480
            reset_caseful = TRUE;
6449
480
            options |= PCRE2_CASELESS;
6450
480
            req_caseopt = REQ_CASELESS;
6451
480
            }
6452
700
          goto CLASS_CASELESS_CHAR;
6453
700
          }
6454
29.0k
        }
6455
31.0k
      }
6456
6457
    /* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */
6458
6459
196k
    pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,
6460
196k
                                          &code, meta == META_CLASS_NOT, NULL,
6461
196k
                                          errorcodeptr, cb, lengthptr);
6462
196k
    if (pptr == NULL) return 0;
6463
196k
    PCRE2_ASSERT(*pptr == META_CLASS_END);
6464
6465
213k
    CLASS_END_PROCESSING:
6466
6467
    /* If this class is the first thing in the branch, there can be no first
6468
    char setting, whatever the repeat count. Any reqcu setting must remain
6469
    unchanged after any kind of repeat. */
6470
6471
213k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6472
213k
    zerofirstcu = firstcu;
6473
213k
    zerofirstcuflags = firstcuflags;
6474
213k
    zeroreqcu = reqcu;
6475
213k
    zeroreqcuflags = reqcuflags;
6476
213k
    break;  /* End of class processing */
6477
6478
6479
    /* ===================================================================*/
6480
    /* Deal with (*VERB)s. */
6481
6482
    /* Check for open captures before ACCEPT and close those that are within
6483
    the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6484
    assertion. In the first pass, just accumulate the length required;
6485
    otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6486
    workspace overflow. Do not set firstcu after *ACCEPT. */
6487
6488
10.2k
    case META_ACCEPT:
6489
10.2k
    cb->had_accept = had_accept = TRUE;
6490
10.2k
    for (oc = open_caps;
6491
80.4k
         oc != NULL && oc->assert_depth >= cb->assert_depth;
6492
70.2k
         oc = oc->next)
6493
70.2k
      {
6494
70.2k
      if (lengthptr != NULL)
6495
35.1k
        {
6496
35.1k
        *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6497
35.1k
        }
6498
35.0k
      else
6499
35.0k
        {
6500
35.0k
        *code++ = OP_CLOSE;
6501
35.0k
        PUT2INC(code, 0, oc->number);
6502
35.0k
        }
6503
70.2k
      }
6504
10.2k
    *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6505
10.2k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6506
10.2k
    break;
6507
6508
8.35k
    case META_PRUNE:
6509
12.7k
    case META_SKIP:
6510
12.7k
    cb->had_pruneorskip = TRUE;
6511
12.7k
    PCRE2_FALLTHROUGH /* Fall through */
6512
14.6k
    case META_COMMIT:
6513
17.9k
    case META_FAIL:
6514
17.9k
    *code++ = verbops[(meta - META_MARK) >> 16];
6515
17.9k
    break;
6516
6517
14.7k
    case META_THEN:
6518
14.7k
    cb->external_flags |= PCRE2_HASTHEN;
6519
14.7k
    *code++ = OP_THEN;
6520
14.7k
    break;
6521
6522
    /* Handle verbs with arguments. Arguments can be very long, especially in
6523
    16- and 32-bit modes, and can overflow the workspace in the first pass.
6524
    However, the argument length is constrained to be small enough to fit in
6525
    one code unit. This check happens in parse_regex(). In the first pass,
6526
    instead of putting the argument into memory, we just update the length
6527
    counter and set up an empty argument. */
6528
6529
2.09k
    case META_THEN_ARG:
6530
2.09k
    cb->external_flags |= PCRE2_HASTHEN;
6531
2.09k
    goto VERB_ARG;
6532
6533
1.20k
    case META_PRUNE_ARG:
6534
4.30k
    case META_SKIP_ARG:
6535
4.30k
    cb->had_pruneorskip = TRUE;
6536
4.30k
    PCRE2_FALLTHROUGH /* Fall through */
6537
6.44k
    case META_MARK:
6538
7.81k
    case META_COMMIT_ARG:
6539
9.90k
    VERB_ARG:
6540
9.90k
    *code++ = verbops[(meta - META_MARK) >> 16];
6541
    /* The length is in characters. */
6542
9.90k
    verbarglen = *(++pptr);
6543
9.90k
    verbculen = 0;
6544
9.90k
    tempcode = code++;
6545
69.8k
    for (int i = 0; i < (int)verbarglen; i++)
6546
59.9k
      {
6547
59.9k
      meta = *(++pptr);
6548
59.9k
#ifdef SUPPORT_UNICODE
6549
59.9k
      if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6550
49.5k
#endif
6551
49.5k
        {
6552
49.5k
        mclength = 1;
6553
49.5k
        mcbuffer[0] = meta;
6554
49.5k
        }
6555
59.9k
      if (lengthptr != NULL) *lengthptr += mclength; else
6556
29.8k
        {
6557
29.8k
        memcpy(code, mcbuffer, CU2BYTES(mclength));
6558
29.8k
        code += mclength;
6559
29.8k
        verbculen += mclength;
6560
29.8k
        }
6561
59.9k
      }
6562
6563
9.90k
    *tempcode = verbculen;   /* Fill in the code unit length */
6564
9.90k
    *code++ = 0;             /* Terminating zero */
6565
9.90k
    break;
6566
6567
6568
    /* ===================================================================*/
6569
    /* Handle options change. The new setting must be passed back for use in
6570
    subsequent branches. Reset the greedy defaults and the case value for
6571
    firstcu and reqcu. */
6572
6573
4.25k
    case META_OPTIONS:
6574
4.25k
    *optionsptr = options = *(++pptr);
6575
4.25k
    *xoptionsptr = xoptions = *(++pptr);
6576
4.25k
    greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6577
4.25k
    greedy_non_default = greedy_default ^ 1;
6578
4.25k
    req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6579
4.25k
    break;
6580
6581
    /* ===================================================================*/
6582
    /* Handle scan substring. Scan substring assertion starts with META_SCS,
6583
    which recursively calls compile_branch. The first opcode processed by
6584
    this recursive call is always META_OFFSET. */
6585
6586
2.51k
    case META_OFFSET:
6587
2.51k
    if (lengthptr != NULL)
6588
1.31k
      {
6589
1.31k
      pptr = PRIV(compile_parse_scan_substr_args)(pptr, errorcodeptr, cb, lengthptr);
6590
1.31k
      if (pptr == NULL)
6591
54
        return 0;
6592
1.26k
      break;
6593
1.31k
      }
6594
6595
3.94k
    while (TRUE)
6596
3.94k
      {
6597
3.94k
      int count, index;
6598
3.94k
      named_group *ng;
6599
6600
3.94k
      switch (META_CODE(*pptr))
6601
3.94k
        {
6602
1.19k
        case META_OFFSET:
6603
1.19k
        pptr++;
6604
1.19k
        SKIPOFFSET(pptr);
6605
1.19k
        continue;
6606
6607
318
        case META_CAPTURE_NAME:
6608
318
        ng = cb->named_groups + pptr[1];
6609
318
        pptr += 2;
6610
318
        count = 0;
6611
318
        index = 0;
6612
6613
318
        if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6614
318
          &count, errorcodeptr, cb)) return 0;
6615
6616
318
        code[0] = OP_DNCREF;
6617
318
        PUT2(code, 1, index);
6618
318
        PUT2(code, 1 + IMM2_SIZE, count);
6619
318
        code += 1 + 2 * IMM2_SIZE;
6620
318
        continue;
6621
6622
1.23k
        case META_CAPTURE_NUMBER:
6623
1.23k
        pptr += 2;
6624
1.23k
        if (pptr[-1] == 0) continue;
6625
6626
1.16k
        code[0] = OP_CREF;
6627
1.16k
        PUT2(code, 1, pptr[-1]);
6628
1.16k
        code += 1 + IMM2_SIZE;
6629
1.16k
        continue;
6630
6631
1.19k
        default:
6632
1.19k
        break;
6633
3.94k
        }
6634
6635
1.19k
      break;
6636
3.94k
      }
6637
1.19k
    --pptr;
6638
1.19k
    break;
6639
6640
2.51k
    case META_SCS:
6641
2.51k
    bravalue = OP_ASSERT_SCS;
6642
2.51k
    cb->assert_depth += 1;
6643
2.51k
    goto GROUP_PROCESS;
6644
6645
6646
    /* ===================================================================*/
6647
    /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6648
    because it could be a numerical check on recursion, or a name check on a
6649
    group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6650
    we can handle it either way. We first try for a name; if not found, process
6651
    the number. */
6652
6653
6.11k
    case META_COND_RNUMBER:   /* (?(Rdigits) */
6654
7.71k
    case META_COND_NAME:      /* (?(name) or (?('name') or (?(<name>) */
6655
11.2k
    case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6656
11.2k
    bravalue = OP_COND;
6657
6658
11.2k
    if (lengthptr != NULL)
6659
6.13k
      {
6660
6.13k
      uint32_t i;
6661
6.13k
      PCRE2_SPTR name;
6662
6.13k
      named_group *ng;
6663
6.13k
      uint32_t *start_pptr = pptr;
6664
6.13k
      uint32_t length = *(++pptr);
6665
6666
6.13k
      GETPLUSOFFSET(offset, pptr);
6667
6.13k
      name = cb->start_pattern + offset;
6668
6669
      /* In the first pass, the names generated in the pre-pass are available,
6670
      but the main name table has not yet been created. Scan the list of names
6671
      generated in the pre-pass in order to get a number and whether or not
6672
      this name is duplicated. If it is not duplicated, we can handle it as a
6673
      numerical group. */
6674
6675
6.13k
      ng = PRIV(compile_find_named_group)(name, length, cb);
6676
6677
6.13k
      if (ng == NULL)
6678
3.34k
        {
6679
        /* If the name was not found we have a bad reference, unless we are
6680
        dealing with R<digits>, which is treated as a recursion test by
6681
        number. */
6682
6683
3.34k
        groupnumber = 0;
6684
3.34k
        if (meta == META_COND_RNUMBER)
6685
3.26k
          {
6686
5.82k
          for (i = 1; i < length; i++)
6687
2.56k
            {
6688
2.56k
            groupnumber = groupnumber * 10 + (name[i] - CHAR_0);
6689
2.56k
            if (groupnumber > MAX_GROUP_NUMBER)
6690
10
              {
6691
10
              *errorcodeptr = ERR61;
6692
10
              cb->erroroffset = offset + i;
6693
10
              return 0;
6694
10
              }
6695
2.56k
            }
6696
3.26k
          }
6697
6698
3.33k
        if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6699
89
          {
6700
89
          *errorcodeptr = ERR15;
6701
89
          cb->erroroffset = offset;
6702
89
          return 0;
6703
89
          }
6704
6705
        /* (?(Rdigits) treated as a recursion reference by number. A value of
6706
        zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6707
        translated into RREF_ANY (which is 0xffff). */
6708
6709
3.24k
        if (groupnumber == 0) groupnumber = RREF_ANY;
6710
3.24k
        PCRE2_ASSERT(start_pptr[0] == META_COND_RNUMBER);
6711
3.24k
        start_pptr[1] = groupnumber;
6712
3.24k
        skipunits = 1+IMM2_SIZE;
6713
3.24k
        goto GROUP_PROCESS_NOTE_EMPTY;
6714
3.33k
        }
6715
6716
      /* From here on, we know we have a name (not a number),
6717
      so treat META_COND_RNUMBER the same as META_COND_NAME. */
6718
2.79k
      if (meta == META_COND_RNUMBER) meta = META_COND_NAME;
6719
6720
2.79k
      if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
6721
772
        {
6722
        /* Found a non-duplicated name. Since it is a global,
6723
        it is enough to update it in the pre-processing phase. */
6724
772
        if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6725
6726
772
        start_pptr[0] = meta;
6727
772
        start_pptr[1] = ng->number;
6728
6729
772
        skipunits = 1 + IMM2_SIZE;
6730
772
        goto GROUP_PROCESS_NOTE_EMPTY;
6731
772
        }
6732
6733
      /* We have a duplicated name. In the compile pass we have to search the
6734
      main table in order to get the index and count values. */
6735
6736
2.02k
      start_pptr[0] = meta | 1;
6737
2.02k
      start_pptr[1] = (uint32_t)(ng - cb->named_groups);
6738
6739
      /* A duplicated name was found. Note that if an R<digits> name is found
6740
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6741
2.02k
      skipunits = 1 + 2 * IMM2_SIZE;
6742
2.02k
      }
6743
5.07k
    else
6744
5.07k
      {
6745
      /* Otherwise lengthptr is equal to NULL,
6746
      which is the second phase of compilation. */
6747
5.07k
      int count, index;
6748
5.07k
      named_group *ng;
6749
6750
      /* Generate code using the data
6751
      collected in the pre-processing phase. */
6752
6753
5.07k
      if (meta == META_COND_RNUMBER)
6754
2.45k
        {
6755
2.45k
        code[1+LINK_SIZE] = OP_RREF;
6756
2.45k
        PUT2(code, 2 + LINK_SIZE, pptr[1]);
6757
2.45k
        skipunits = 1 + IMM2_SIZE;
6758
2.45k
        pptr += 1 + SIZEOFFSET;
6759
2.45k
        goto GROUP_PROCESS_NOTE_EMPTY;
6760
2.45k
        }
6761
6762
2.62k
      if (meta_arg == 0)
6763
641
        {
6764
641
        code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6765
641
        PUT2(code, 2 + LINK_SIZE, pptr[1]);
6766
641
        skipunits = 1 + IMM2_SIZE;
6767
641
        pptr += 1 + SIZEOFFSET;
6768
641
        goto GROUP_PROCESS_NOTE_EMPTY;
6769
641
        }
6770
6771
1.98k
      ng = cb->named_groups + pptr[1];
6772
1.98k
      count = 0;  /* Values for first pass (avoids compiler warning) */
6773
1.98k
      index = 0;
6774
6775
      /* The failed case is an internal error. */
6776
1.98k
      if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6777
1.98k
            &count, errorcodeptr, cb)) return 0;
6778
6779
      /* A duplicated name was found. Note that if an R<digits> name is found
6780
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6781
6782
1.98k
      code[1 + LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;
6783
6784
      /* Insert appropriate data values. */
6785
1.98k
      PUT2(code, 2 + LINK_SIZE, index);
6786
1.98k
      PUT2(code, 2 + LINK_SIZE + IMM2_SIZE, count);
6787
1.98k
      skipunits = 1 + 2 * IMM2_SIZE;
6788
1.98k
      pptr += 1 + SIZEOFFSET;
6789
1.98k
      }
6790
6791
4.00k
    PCRE2_ASSERT(meta != META_CAPTURE_NAME);
6792
4.00k
    goto GROUP_PROCESS_NOTE_EMPTY;
6793
6794
    /* The DEFINE condition is always false. Its internal groups may never
6795
    be called, so matched_char must remain false, hence the jump to
6796
    GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6797
6798
427
    case META_COND_DEFINE:
6799
427
    bravalue = OP_COND;
6800
427
    GETPLUSOFFSET(offset, pptr);
6801
427
    code[1+LINK_SIZE] = OP_DEFINE;
6802
427
    skipunits = 1;
6803
427
    goto GROUP_PROCESS;
6804
6805
    /* Conditional test of a group's being set. */
6806
6807
553
    case META_COND_NUMBER:
6808
553
    bravalue = OP_COND;
6809
553
    GETPLUSOFFSET(offset, pptr);
6810
6811
553
    groupnumber = *(++pptr);
6812
553
    if (groupnumber > cb->bracount)
6813
26
      {
6814
26
      *errorcodeptr = ERR15;
6815
26
      cb->erroroffset = offset;
6816
26
      return 0;
6817
26
      }
6818
527
    if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6819
6820
    /* Point at initial ( for too many branches error */
6821
527
    offset -= 2;
6822
527
    code[1+LINK_SIZE] = OP_CREF;
6823
527
    skipunits = 1+IMM2_SIZE;
6824
527
    PUT2(code, 2+LINK_SIZE, groupnumber);
6825
527
    goto GROUP_PROCESS_NOTE_EMPTY;
6826
6827
    /* Test for the PCRE2 version. */
6828
6829
2.34k
    case META_COND_VERSION:
6830
2.34k
    bravalue = OP_COND;
6831
2.34k
    if (pptr[1] > 0)
6832
857
      code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6833
599
        (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6834
751
          OP_TRUE : OP_FALSE;
6835
1.48k
    else
6836
1.48k
      code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6837
1.48k
        OP_TRUE : OP_FALSE;
6838
2.34k
    skipunits = 1;
6839
2.34k
    pptr += 3;
6840
2.34k
    goto GROUP_PROCESS_NOTE_EMPTY;
6841
6842
    /* The condition is an assertion, possibly preceded by a callout. */
6843
6844
18.1k
    case META_COND_ASSERT:
6845
18.1k
    bravalue = OP_COND;
6846
18.1k
    goto GROUP_PROCESS_NOTE_EMPTY;
6847
6848
6849
    /* ===================================================================*/
6850
    /* Handle all kinds of nested bracketed groups. The non-capturing,
6851
    non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6852
6853
27.4k
    case META_LOOKAHEAD:
6854
27.4k
    bravalue = OP_ASSERT;
6855
27.4k
    cb->assert_depth += 1;
6856
27.4k
    goto GROUP_PROCESS;
6857
6858
13.1k
    case META_LOOKAHEAD_NA:
6859
13.1k
    bravalue = OP_ASSERT_NA;
6860
13.1k
    cb->assert_depth += 1;
6861
13.1k
    goto GROUP_PROCESS;
6862
6863
    /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6864
    thing to do, but Perl allows all assertions to be quantified, and when
6865
    they contain capturing parentheses there may be a potential use for
6866
    this feature. Not that that applies to a quantified (?!) but we allow
6867
    it for uniformity. */
6868
6869
17.9k
    case META_LOOKAHEADNOT:
6870
17.9k
    if (pptr[1] == META_KET &&
6871
2.04k
         (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6872
1.32k
      {
6873
1.32k
      *code++ = OP_FAIL;
6874
1.32k
      pptr++;
6875
1.32k
      }
6876
16.6k
    else
6877
16.6k
      {
6878
16.6k
      bravalue = OP_ASSERT_NOT;
6879
16.6k
      cb->assert_depth += 1;
6880
16.6k
      goto GROUP_PROCESS;
6881
16.6k
      }
6882
1.32k
    break;
6883
6884
7.53k
    case META_LOOKBEHIND:
6885
7.53k
    bravalue = OP_ASSERTBACK;
6886
7.53k
    cb->assert_depth += 1;
6887
7.53k
    goto GROUP_PROCESS;
6888
6889
9.37k
    case META_LOOKBEHINDNOT:
6890
9.37k
    bravalue = OP_ASSERTBACK_NOT;
6891
9.37k
    cb->assert_depth += 1;
6892
9.37k
    goto GROUP_PROCESS;
6893
6894
2.96k
    case META_LOOKBEHIND_NA:
6895
2.96k
    bravalue = OP_ASSERTBACK_NA;
6896
2.96k
    cb->assert_depth += 1;
6897
2.96k
    goto GROUP_PROCESS;
6898
6899
5.09k
    case META_ATOMIC:
6900
5.09k
    bravalue = OP_ONCE;
6901
5.09k
    goto GROUP_PROCESS_NOTE_EMPTY;
6902
6903
2.02k
    case META_SCRIPT_RUN:
6904
2.02k
    bravalue = OP_SCRIPT_RUN;
6905
2.02k
    goto GROUP_PROCESS_NOTE_EMPTY;
6906
6907
77.2k
    case META_NOCAPTURE:
6908
77.2k
    bravalue = OP_BRA;
6909
    /* Fall through */
6910
6911
    /* Process nested bracketed regex. The nesting depth is maintained for the
6912
    benefit of the stackguard function. The test for too deep nesting is now
6913
    done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6914
    others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6915
    note of whether or not they may match an empty string. */
6916
6917
1.03M
    GROUP_PROCESS_NOTE_EMPTY:
6918
1.03M
    note_group_empty = TRUE;
6919
6920
1.11M
    GROUP_PROCESS:
6921
1.11M
    cb->parens_depth += 1;
6922
1.11M
    *code = bravalue;
6923
1.11M
    pptr++;
6924
1.11M
    tempcode = code;
6925
1.11M
    tempreqvary = cb->req_varyopt;        /* Save value before group */
6926
1.11M
    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6927
6928
1.11M
    if ((group_return =
6929
1.11M
         compile_regex(
6930
1.11M
         options,                         /* The options state */
6931
1.11M
         xoptions,                        /* The extra options state */
6932
1.11M
         &tempcode,                       /* Where to put code (updated) */
6933
1.11M
         &pptr,                           /* Input pointer (updated) */
6934
1.11M
         errorcodeptr,                    /* Where to put an error message */
6935
1.11M
         skipunits,                       /* Skip over bracket number */
6936
1.11M
         &subfirstcu,                     /* For possible first char */
6937
1.11M
         &subfirstcuflags,
6938
1.11M
         &subreqcu,                       /* For possible last char */
6939
1.11M
         &subreqcuflags,
6940
1.11M
         bcptr,                           /* Current branch chain */
6941
1.11M
         open_caps,                       /* Pointer to capture stack */
6942
1.11M
         cb,                              /* Compile data block */
6943
1.11M
         (lengthptr == NULL)? NULL :      /* Actual compile phase */
6944
1.11M
           &length_prevgroup              /* Pre-compile phase */
6945
1.11M
         )) == 0)
6946
1.18k
      return 0;  /* Error */
6947
6948
1.11M
    cb->parens_depth -= 1;
6949
6950
    /* If that was a non-conditional significant group (not an assertion, not a
6951
    DEFINE) that matches at least one character, then the current item matches
6952
    a character. Conditionals are handled below. */
6953
6954
1.11M
    if (note_group_empty && bravalue != OP_COND && group_return > 0)
6955
199k
      matched_char = TRUE;
6956
6957
    /* If we've just compiled an assertion, pop the assert depth. */
6958
6959
1.11M
    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)
6960
79.3k
      cb->assert_depth -= 1;
6961
6962
    /* At the end of compiling, code is still pointing to the start of the
6963
    group, while tempcode has been updated to point past the end of the group.
6964
    The parsed pattern pointer (pptr) is on the closing META_KET.
6965
6966
    If this is a conditional bracket, check that there are no more than
6967
    two branches in the group, or just one if it's a DEFINE group. We do this
6968
    in the real compile phase, not in the pre-pass, where the whole group may
6969
    not be available. */
6970
6971
1.11M
    if (bravalue == OP_COND && lengthptr == NULL)
6972
15.7k
      {
6973
15.7k
      PCRE2_UCHAR *tc = code;
6974
15.7k
      int condcount = 0;
6975
6976
19.3k
      do {
6977
19.3k
         condcount++;
6978
19.3k
         tc += GET(tc,1);
6979
19.3k
         }
6980
19.3k
      while (*tc != OP_KET);
6981
6982
      /* A DEFINE group is never obeyed inline (the "condition" is always
6983
      false). It must have only one branch. Having checked this, change the
6984
      opcode to OP_FALSE. */
6985
6986
15.7k
      if (code[LINK_SIZE+1] == OP_DEFINE)
6987
205
        {
6988
205
        if (condcount > 1)
6989
6
          {
6990
6
          cb->erroroffset = offset;
6991
6
          *errorcodeptr = ERR54;
6992
6
          return 0;
6993
6
          }
6994
199
        code[LINK_SIZE+1] = OP_FALSE;
6995
199
        bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6996
199
        }
6997
6998
      /* A "normal" conditional group. If there is just one branch, we must not
6999
      make use of its firstcu or reqcu, because this is equivalent to an
7000
      empty second branch. Also, it may match an empty string. If there are two
7001
      branches, this item must match a character if the group must. */
7002
7003
15.5k
      else
7004
15.5k
        {
7005
15.5k
        if (condcount > 2)
7006
51
          {
7007
51
          cb->erroroffset = offset;
7008
51
          *errorcodeptr = ERR27;
7009
51
          return 0;
7010
51
          }
7011
15.4k
        if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
7012
3.05k
          else if (group_return > 0) matched_char = TRUE;
7013
15.4k
        }
7014
15.7k
      }
7015
7016
    /* In the pre-compile phase, update the length by the length of the group,
7017
    less the brackets at either end. Then reduce the compiled code to just a
7018
    set of non-capturing brackets so that it doesn't use much memory if it is
7019
    duplicated by a quantifier.*/
7020
7021
1.11M
    if (lengthptr != NULL)
7022
563k
      {
7023
563k
      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7024
0
        {
7025
0
        *errorcodeptr = ERR20;
7026
0
        return 0;
7027
0
        }
7028
563k
      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7029
563k
      code++;   /* This already contains bravalue */
7030
563k
      PUTINC(code, 0, 1 + LINK_SIZE);
7031
563k
      *code++ = OP_KET;
7032
563k
      PUTINC(code, 0, 1 + LINK_SIZE);
7033
563k
      break;    /* No need to waste time with special character handling */
7034
563k
      }
7035
7036
    /* Otherwise update the main code pointer to the end of the group. */
7037
7038
550k
    code = tempcode;
7039
7040
    /* For a DEFINE group, required and first character settings are not
7041
    relevant. */
7042
7043
550k
    if (bravalue == OP_DEFINE) break;
7044
7045
    /* Handle updating of the required and first code units for other types of
7046
    group. Update for normal brackets of all kinds, and conditions with two
7047
    branches (see code above). If the bracket is followed by a quantifier with
7048
    zero repeat, we have to back off. Hence the definition of zeroreqcu and
7049
    zerofirstcu outside the main loop so that they can be accessed for the back
7050
    off. */
7051
7052
550k
    zeroreqcu = reqcu;
7053
550k
    zeroreqcuflags = reqcuflags;
7054
550k
    zerofirstcu = firstcu;
7055
550k
    zerofirstcuflags = firstcuflags;
7056
550k
    groupsetfirstcu = FALSE;
7057
7058
550k
    if (bravalue >= OP_ONCE)  /* Not an assertion */
7059
510k
      {
7060
      /* If we have not yet set a firstcu in this branch, take it from the
7061
      subpattern, remembering that it was set here so that a repeat of more
7062
      than one can replicate it as reqcu if necessary. If the subpattern has
7063
      no firstcu, set "none" for the whole branch. In both cases, a zero
7064
      repeat forces firstcu to "none". */
7065
7066
510k
      if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
7067
65.3k
        {
7068
65.3k
        if (subfirstcuflags < REQ_NONE)
7069
14.5k
          {
7070
14.5k
          firstcu = subfirstcu;
7071
14.5k
          firstcuflags = subfirstcuflags;
7072
14.5k
          groupsetfirstcu = TRUE;
7073
14.5k
          }
7074
50.8k
        else firstcuflags = REQ_NONE;
7075
65.3k
        zerofirstcuflags = REQ_NONE;
7076
65.3k
        }
7077
7078
      /* If firstcu was previously set, convert the subpattern's firstcu
7079
      into reqcu if there wasn't one, using the vary flag that was in
7080
      existence beforehand. */
7081
7082
445k
      else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
7083
6.28k
        {
7084
6.28k
        subreqcu = subfirstcu;
7085
6.28k
        subreqcuflags = subfirstcuflags | tempreqvary;
7086
6.28k
        }
7087
7088
      /* If the subpattern set a required code unit (or set a first code unit
7089
      that isn't really the first code unit - see above), set it. */
7090
7091
510k
      if (subreqcuflags < REQ_NONE)
7092
77.6k
        {
7093
77.6k
        reqcu = subreqcu;
7094
77.6k
        reqcuflags = subreqcuflags;
7095
77.6k
        }
7096
510k
      }
7097
7098
    /* For a forward assertion, we take the reqcu, if set, provided that the
7099
    group has also set a firstcu. This can be helpful if the pattern that
7100
    follows the assertion doesn't set a different char. For example, it's
7101
    useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7102
    because it leads to incorrect effect for patterns such as /(?=a)a.+/ when
7103
    the "real" "a" would then become a reqcu instead of a firstcu. This is
7104
    overcome by a scan at the end if there's no firstcu, looking for an
7105
    asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7106
    we must only take the reqcu when the group also set a firstcu. Otherwise,
7107
    in that example, 'X' ends up set for both. */
7108
7109
39.1k
    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7110
20.0k
             subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7111
4.72k
      {
7112
4.72k
      reqcu = subreqcu;
7113
4.72k
      reqcuflags = subreqcuflags;
7114
4.72k
      }
7115
7116
550k
    break;  /* End of nested group handling */
7117
7118
7119
    /* ===================================================================*/
7120
    /* Handle named backreferences and recursions. */
7121
7122
9.99k
    case META_BACKREF_BYNAME:
7123
10.3k
    case META_RECURSE_BYNAME:
7124
10.3k
      {
7125
10.3k
      int count, index;
7126
10.3k
      PCRE2_SPTR name;
7127
10.3k
      named_group *ng;
7128
10.3k
      uint32_t length = *(++pptr);
7129
7130
10.3k
      GETPLUSOFFSET(offset, pptr);
7131
10.3k
      name = cb->start_pattern + offset;
7132
7133
      /* In the first pass, the names generated in the pre-pass are available,
7134
      but the main name table has not yet been created. Scan the list of names
7135
      generated in the pre-pass in order to get a number and whether or not
7136
      this name is duplicated. */
7137
7138
10.3k
      ng = PRIV(compile_find_named_group)(name, length, cb);
7139
7140
10.3k
      if (ng == NULL)
7141
73
        {
7142
        /* If the name was not found we have a bad reference. */
7143
73
        *errorcodeptr = ERR15;
7144
73
        cb->erroroffset = offset;
7145
73
        return 0;
7146
73
        }
7147
7148
10.2k
      groupnumber = ng->number;
7149
7150
      /* For a recursion, that's all that is needed. We can now go to
7151
      the code that handles numerical recursion, applying it to the first
7152
      group with the given name. */
7153
7154
10.2k
      if (meta == META_RECURSE_BYNAME)
7155
284
        {
7156
284
        meta_arg = groupnumber;
7157
284
        goto HANDLE_NUMERICAL_RECURSION;
7158
284
        }
7159
7160
      /* For a back reference, update the back reference map and the
7161
      maximum back reference. */
7162
7163
9.94k
      cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7164
9.94k
      if (groupnumber > cb->top_backref)
7165
845
        cb->top_backref = groupnumber;
7166
7167
      /* If a back reference name is not duplicated, we can handle it as
7168
      a numerical reference. */
7169
7170
9.94k
      if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
7171
957
        {
7172
957
        meta_arg = groupnumber;
7173
957
        goto HANDLE_SINGLE_REFERENCE;
7174
957
        }
7175
7176
      /* If a back reference name is duplicated, we generate a different
7177
      opcode to a numerical back reference. In the second pass we must
7178
      search for the index and count in the final name table. */
7179
7180
8.98k
      count = 0;  /* Values for first pass (avoids compiler warning) */
7181
8.98k
      index = 0;
7182
8.98k
      if (lengthptr == NULL && !PRIV(compile_find_dupname_details)(name, length,
7183
4.41k
            &index, &count, errorcodeptr, cb)) return 0;
7184
7185
8.98k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7186
8.98k
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7187
8.98k
      PUT2INC(code, 0, index);
7188
8.98k
      PUT2INC(code, 0, count);
7189
8.98k
      if ((options & PCRE2_CASELESS) != 0)
7190
3.02k
        *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7191
2.98k
                   REFI_FLAG_CASELESS_RESTRICT : 0) |
7192
3.02k
                  (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7193
3.02k
                   REFI_FLAG_TURKISH_CASING : 0);
7194
8.98k
      }
7195
0
    break;
7196
7197
7198
    /* ===================================================================*/
7199
    /* Handle a numerical callout. */
7200
7201
1.96M
    case META_CALLOUT_NUMBER:
7202
1.96M
    code[0] = OP_CALLOUT;
7203
1.96M
    PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7204
1.96M
    PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7205
1.96M
    code[1 + 2*LINK_SIZE] = pptr[3];
7206
1.96M
    pptr += 3;
7207
1.96M
    code += PRIV(OP_lengths)[OP_CALLOUT];
7208
1.96M
    break;
7209
7210
7211
    /* ===================================================================*/
7212
    /* Handle a callout with a string argument. In the pre-pass we just compute
7213
    the length without generating anything. The length in pptr[3] includes both
7214
    delimiters; in the actual compile only the first one is copied, but a
7215
    terminating zero is added. Any doubled delimiters within the string make
7216
    this an overestimate, but it is not worth bothering about. */
7217
7218
8.10k
    case META_CALLOUT_STRING:
7219
8.10k
    if (lengthptr != NULL)
7220
4.05k
      {
7221
4.05k
      *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7222
4.05k
      pptr += 3;
7223
4.05k
      SKIPOFFSET(pptr);
7224
4.05k
      }
7225
7226
    /* In the real compile we can copy the string. The starting delimiter is
7227
     included so that the client can discover it if they want. We also pass the
7228
     start offset to help a script language give better error messages. */
7229
7230
4.04k
    else
7231
4.04k
      {
7232
4.04k
      PCRE2_SPTR pp;
7233
4.04k
      uint32_t delimiter;
7234
4.04k
      uint32_t length = pptr[3];
7235
4.04k
      PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7236
7237
4.04k
      code[0] = OP_CALLOUT_STR;
7238
4.04k
      PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7239
4.04k
      PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7240
7241
4.04k
      pptr += 3;
7242
4.04k
      GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7243
4.04k
      pp = cb->start_pattern + offset;
7244
4.04k
      delimiter = *callout_string++ = *pp++;
7245
4.04k
      if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7246
1.53k
        delimiter = CHAR_RIGHT_CURLY_BRACKET;
7247
4.04k
      PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7248
7249
      /* The syntax of the pattern was checked in the parsing scan. The length
7250
      includes both delimiters, but we have passed the opening one just above,
7251
      so we reduce length before testing it. The test is for > 1 because we do
7252
      not want to copy the final delimiter. This also ensures that pp[1] is
7253
      accessible. */
7254
7255
11.9k
      while (--length > 1)
7256
7.90k
        {
7257
7.90k
        if (*pp == delimiter && pp[1] == delimiter)
7258
241
          {
7259
241
          *callout_string++ = delimiter;
7260
241
          pp += 2;
7261
241
          length--;
7262
241
          }
7263
7.66k
        else *callout_string++ = *pp++;
7264
7.90k
        }
7265
4.04k
      *callout_string++ = CHAR_NUL;
7266
7267
      /* Set the length of the entire item, then advance to its end. */
7268
7269
4.04k
      PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7270
4.04k
      code = callout_string;
7271
4.04k
      }
7272
8.10k
    break;
7273
7274
7275
    /* ===================================================================*/
7276
    /* Handle repetition. The different types are all sorted out in the parsing
7277
    pass. */
7278
7279
38.0k
    case META_MINMAX_PLUS:
7280
41.9k
    case META_MINMAX_QUERY:
7281
147k
    case META_MINMAX:
7282
147k
    repeat_min = *(++pptr);
7283
147k
    repeat_max = *(++pptr);
7284
147k
    goto REPEAT;
7285
7286
241k
    case META_ASTERISK:
7287
252k
    case META_ASTERISK_PLUS:
7288
255k
    case META_ASTERISK_QUERY:
7289
255k
    repeat_min = 0;
7290
255k
    repeat_max = REPEAT_UNLIMITED;
7291
255k
    goto REPEAT;
7292
7293
265k
    case META_PLUS:
7294
304k
    case META_PLUS_PLUS:
7295
309k
    case META_PLUS_QUERY:
7296
309k
    repeat_min = 1;
7297
309k
    repeat_max = REPEAT_UNLIMITED;
7298
309k
    goto REPEAT;
7299
7300
147k
    case META_QUERY:
7301
168k
    case META_QUERY_PLUS:
7302
171k
    case META_QUERY_QUERY:
7303
171k
    repeat_min = 0;
7304
171k
    repeat_max = 1;
7305
7306
883k
    REPEAT:
7307
883k
    if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7308
7309
    /* Remember whether this is a variable length repeat, and default to
7310
    single-char opcodes. */
7311
7312
883k
    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7313
7314
    /* Adjust first and required code units for a zero repeat. */
7315
7316
883k
    if (repeat_min == 0)
7317
469k
      {
7318
469k
      firstcu = zerofirstcu;
7319
469k
      firstcuflags = zerofirstcuflags;
7320
469k
      reqcu = zeroreqcu;
7321
469k
      reqcuflags = zeroreqcuflags;
7322
469k
      }
7323
7324
    /* Note the greediness and possessiveness. */
7325
7326
883k
    switch (meta)
7327
883k
      {
7328
38.0k
      case META_MINMAX_PLUS:
7329
49.3k
      case META_ASTERISK_PLUS:
7330
87.7k
      case META_PLUS_PLUS:
7331
108k
      case META_QUERY_PLUS:
7332
108k
      repeat_type = 0;                  /* Force greedy */
7333
108k
      possessive_quantifier = TRUE;
7334
108k
      break;
7335
7336
3.90k
      case META_MINMAX_QUERY:
7337
6.70k
      case META_ASTERISK_QUERY:
7338
11.8k
      case META_PLUS_QUERY:
7339
15.3k
      case META_QUERY_QUERY:
7340
15.3k
      repeat_type = greedy_non_default;
7341
15.3k
      possessive_quantifier = FALSE;
7342
15.3k
      break;
7343
7344
759k
      default:
7345
759k
      repeat_type = greedy_default;
7346
759k
      possessive_quantifier = FALSE;
7347
759k
      break;
7348
883k
      }
7349
7350
    /* Save start of previous item, in case we have to move it up in order to
7351
    insert something before it, and remember what it was. */
7352
7353
883k
    PCRE2_ASSERT(previous != NULL);
7354
883k
    tempcode = previous;
7355
883k
    op_previous = *previous;
7356
7357
    /* Now handle repetition for the different types of item. If the repeat
7358
    minimum and the repeat maximum are both 1, we can ignore the quantifier for
7359
    non-parenthesized items, as they have only one alternative. For anything in
7360
    parentheses, we must not ignore if {1} is possessive. */
7361
7362
883k
    switch (op_previous)
7363
883k
      {
7364
      /* If previous was a character or negated character match, abolish the
7365
      item and generate a repeat item instead. If a char item has a minimum of
7366
      more than one, ensure that it is set in reqcu - it might not be if a
7367
      sequence such as x{3} is the first thing in a branch because the x will
7368
      have gone into firstcu instead.  */
7369
7370
276k
      case OP_CHAR:
7371
388k
      case OP_CHARI:
7372
408k
      case OP_NOT:
7373
425k
      case OP_NOTI:
7374
425k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7375
424k
      op_type = chartypeoffset[op_previous - OP_CHAR];
7376
7377
      /* Deal with UTF characters that take up more than one code unit. */
7378
7379
424k
#ifdef MAYBE_UTF_MULTI
7380
424k
      if (utf && NOT_FIRSTCU(code[-1]))
7381
9.42k
        {
7382
9.42k
        PCRE2_UCHAR *lastchar = code - 1;
7383
9.42k
        BACKCHAR(lastchar);
7384
9.42k
        mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7385
9.42k
        memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7386
9.42k
        }
7387
415k
      else
7388
415k
#endif  /* MAYBE_UTF_MULTI */
7389
7390
      /* Handle the case of a single code unit - either with no UTF support, or
7391
      with UTF disabled, or for a single-code-unit UTF character. In the latter
7392
      case, for a repeated positive match, get the caseless flag for the
7393
      required code unit from the previous character, because a class like [Aa]
7394
      sets a caseless A but by now the req_caseopt flag has been reset. */
7395
7396
415k
        {
7397
415k
        mcbuffer[0] = code[-1];
7398
415k
        mclength = 1;
7399
415k
        if (op_previous <= OP_CHARI && repeat_min > 1)
7400
18.7k
          {
7401
18.7k
          reqcu = mcbuffer[0];
7402
18.7k
          reqcuflags = cb->req_varyopt;
7403
18.7k
          if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7404
18.7k
          }
7405
415k
        }
7406
424k
      goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7407
7408
      /* If previous was a character class or a back reference, we put the
7409
      repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7410
7411
0
#ifdef SUPPORT_WIDE_CHARS
7412
16.1k
      case OP_XCLASS:
7413
18.8k
      case OP_ECLASS:
7414
18.8k
#endif
7415
64.4k
      case OP_CLASS:
7416
92.6k
      case OP_NCLASS:
7417
97.3k
      case OP_REF:
7418
99.5k
      case OP_REFI:
7419
101k
      case OP_DNREF:
7420
102k
      case OP_DNREFI:
7421
7422
102k
      if (repeat_max == 0)
7423
1.40k
        {
7424
1.40k
        code = previous;
7425
1.40k
        goto END_REPEAT;
7426
1.40k
        }
7427
101k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7428
7429
101k
      if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7430
35.5k
        *code++ = OP_CRSTAR + repeat_type;
7431
65.6k
      else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7432
25.8k
        *code++ = OP_CRPLUS + repeat_type;
7433
39.8k
      else if (repeat_min == 0 && repeat_max == 1)
7434
17.6k
        *code++ = OP_CRQUERY + repeat_type;
7435
22.1k
      else
7436
22.1k
        {
7437
22.1k
        *code++ = OP_CRRANGE + repeat_type;
7438
22.1k
        PUT2INC(code, 0, repeat_min);
7439
22.1k
        if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7440
22.1k
        PUT2INC(code, 0, repeat_max);
7441
22.1k
        }
7442
101k
      break;
7443
7444
      /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7445
      because pcre2_match() could not handle backtracking into recursively
7446
      called groups. Now that this backtracking is available, we no longer need
7447
      to do this. However, we still need to replicate recursions as we do for
7448
      groups so as to have independent backtracking points. We can replicate
7449
      for the minimum number of repeats directly. For optional repeats we now
7450
      wrap the recursion in OP_BRA brackets and make use of the bracket
7451
      repetition. */
7452
7453
7.84k
      case OP_RECURSE:
7454
7.84k
      if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7455
70
        goto END_REPEAT;
7456
7457
      /* Generate unwrapped repeats for a non-zero minimum, except when the
7458
      minimum is 1 and the maximum unlimited, because that can be handled with
7459
      OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7460
      minimum, we just need to generate the appropriate additional copies.
7461
      Otherwise we need to generate one more, to simulate the situation when
7462
      the minimum is zero. */
7463
7464
7.77k
      if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7465
2.29k
        {
7466
2.29k
        int replicate = repeat_min;
7467
7468
2.29k
        if (repeat_min == repeat_max) replicate--;
7469
7470
        /* In the pre-compile phase, we don't actually do the replication. We
7471
        just adjust the length as if we had. Do some paranoid checks for
7472
        potential integer overflow. */
7473
7474
2.29k
        if (lengthptr != NULL)
7475
1.20k
          {
7476
1.20k
          PCRE2_SIZE delta;
7477
1.20k
          if (PRIV(ckd_smul)(&delta, replicate, (int)length_prevgroup) ||
7478
1.20k
              OFLOW_MAX - *lengthptr < delta)
7479
0
            {
7480
0
            *errorcodeptr = ERR20;
7481
0
            return 0;
7482
0
            }
7483
1.20k
          *lengthptr += delta;
7484
1.20k
          }
7485
961k
        else for (int i = 0; i < replicate; i++)
7486
959k
          {
7487
959k
          memcpy(code, previous, CU2BYTES(length_prevgroup));
7488
959k
          previous = code;
7489
959k
          code += length_prevgroup;
7490
959k
          }
7491
7492
        /* If the number of repeats is fixed, we are done. Otherwise, adjust
7493
        the counts and fall through. */
7494
7495
2.29k
        if (repeat_min == repeat_max) break;
7496
564
        if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7497
564
        repeat_min = 0;
7498
564
        }
7499
7500
      /* Wrap the recursion call in OP_BRA brackets. */
7501
6.04k
        {
7502
6.04k
        PCRE2_SIZE length = (lengthptr != NULL) ? 1 + LINK_SIZE : length_prevgroup;
7503
7504
6.04k
        (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(length));
7505
6.04k
        op_previous = *previous = OP_BRA;
7506
6.04k
        PUT(previous, 1, 1 + LINK_SIZE + length);
7507
6.04k
        previous[1 + LINK_SIZE + length] = OP_KET;
7508
6.04k
        PUT(previous, 2 + LINK_SIZE + length, 1 + LINK_SIZE + length);
7509
6.04k
        }
7510
6.04k
      code += 2 + 2 * LINK_SIZE;
7511
6.04k
      length_prevgroup += 2 + 2 * LINK_SIZE;
7512
6.04k
      group_return = -1;  /* Set "may match empty string" */
7513
7514
      /* Now treat as a repeated OP_BRA. */
7515
6.04k
      PCRE2_FALLTHROUGH /* Fall through */
7516
7517
      /* If previous was a bracket group, we may have to replicate it in
7518
      certain cases. Note that at this point we can encounter only the "basic"
7519
      bracket opcodes such as BRA and CBRA, as this is the place where they get
7520
      converted into the more special varieties such as BRAPOS and SBRA.
7521
      Originally, PCRE did not allow repetition of assertions, but now it does,
7522
      for Perl compatibility. */
7523
7524
13.5k
      case OP_ASSERT:
7525
18.9k
      case OP_ASSERT_NOT:
7526
24.6k
      case OP_ASSERT_NA:
7527
26.6k
      case OP_ASSERTBACK:
7528
29.2k
      case OP_ASSERTBACK_NOT:
7529
30.4k
      case OP_ASSERTBACK_NA:
7530
31.1k
      case OP_ASSERT_SCS:
7531
33.9k
      case OP_ONCE:
7532
34.4k
      case OP_SCRIPT_RUN:
7533
51.4k
      case OP_BRA:
7534
119k
      case OP_CBRA:
7535
130k
      case OP_COND:
7536
130k
        {
7537
130k
        int len = (int)(code - previous);
7538
130k
        PCRE2_UCHAR *bralink = NULL;
7539
130k
        PCRE2_UCHAR *brazeroptr = NULL;
7540
7541
130k
        if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7542
534
          goto END_REPEAT;
7543
7544
        /* Repeating a DEFINE group (or any group where the condition is always
7545
        FALSE and there is only one branch) is pointless, but Perl allows the
7546
        syntax, so we just ignore the repeat. */
7547
7548
129k
        if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7549
337
            previous[GET(previous, 1)] != OP_ALT)
7550
205
          goto END_REPEAT;
7551
7552
        /* Perl allows all assertions to be quantified, and when they contain
7553
        capturing parentheses and/or are optional there are potential uses for
7554
        this feature. PCRE2 used to force the maximum quantifier to 1 on the
7555
        invalid grounds that further repetition was never useful. This was
7556
        always a bit pointless, since an assertion could be wrapped with a
7557
        repeated group to achieve the effect. General repetition is now
7558
        permitted, but if the maximum is unlimited it is set to one more than
7559
        the minimum. */
7560
7561
129k
        if (op_previous < OP_ONCE)    /* Assertion */
7562
24.9k
          {
7563
24.9k
          if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7564
24.9k
          }
7565
7566
        /* The case of a zero minimum is special because of the need to stick
7567
        OP_BRAZERO in front of it, and because the group appears once in the
7568
        data, whereas in other cases it appears the minimum number of times. For
7569
        this reason, it is simplest to treat this case separately, as otherwise
7570
        the code gets far too messy. There are several special subcases when the
7571
        minimum is zero. */
7572
7573
129k
        if (repeat_min == 0)
7574
26.9k
          {
7575
          /* If the maximum is also zero, we used to just omit the group from
7576
          the output altogether, like this:
7577
7578
          ** if (repeat_max == 0)
7579
          **   {
7580
          **   code = previous;
7581
          **   goto END_REPEAT;
7582
          **   }
7583
7584
          However, that fails when a group or a subgroup within it is
7585
          referenced as a subroutine from elsewhere in the pattern, so now we
7586
          stick in OP_SKIPZERO in front of it so that it is skipped on
7587
          execution. As we don't have a list of which groups are referenced, we
7588
          cannot do this selectively.
7589
7590
          If the maximum is 1 or unlimited, we just have to stick in the
7591
          BRAZERO and do no more at this point. */
7592
7593
26.9k
          if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7594
21.0k
            {
7595
21.0k
            (void)memmove(previous + 1, previous, CU2BYTES(len));
7596
21.0k
            code++;
7597
21.0k
            if (repeat_max == 0)
7598
668
              {
7599
668
              *previous++ = OP_SKIPZERO;
7600
668
              goto END_REPEAT;
7601
668
              }
7602
20.3k
            brazeroptr = previous;    /* Save for possessive optimizing */
7603
20.3k
            *previous++ = OP_BRAZERO + repeat_type;
7604
20.3k
            }
7605
7606
          /* If the maximum is greater than 1 and limited, we have to replicate
7607
          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7608
          The first one has to be handled carefully because it's the original
7609
          copy, which has to be moved up. The remainder can be handled by code
7610
          that is common with the non-zero minimum case below. We have to
7611
          adjust the value of repeat_max, since one less copy is required. */
7612
7613
5.98k
          else
7614
5.98k
            {
7615
5.98k
            int linkoffset;
7616
5.98k
            (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7617
5.98k
            code += 2 + LINK_SIZE;
7618
5.98k
            *previous++ = OP_BRAZERO + repeat_type;
7619
5.98k
            *previous++ = OP_BRA;
7620
7621
            /* We chain together the bracket link offset fields that have to be
7622
            filled in later when the ends of the brackets are reached. */
7623
7624
5.98k
            linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7625
5.98k
            bralink = previous;
7626
5.98k
            PUTINC(previous, 0, linkoffset);
7627
5.98k
            }
7628
7629
26.3k
          if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7630
26.3k
          }
7631
7632
        /* If the minimum is greater than zero, replicate the group as many
7633
        times as necessary, and adjust the maximum to the number of subsequent
7634
        copies that we need. */
7635
7636
102k
        else
7637
102k
          {
7638
102k
          if (repeat_min > 1)
7639
39.9k
            {
7640
            /* In the pre-compile phase, we don't actually do the replication.
7641
            We just adjust the length as if we had. Do some paranoid checks for
7642
            potential integer overflow. */
7643
7644
39.9k
            if (lengthptr != NULL)
7645
20.2k
              {
7646
20.2k
              PCRE2_SIZE delta;
7647
20.2k
              if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7648
20.2k
                                 (int)length_prevgroup) ||
7649
20.2k
                  OFLOW_MAX - *lengthptr < delta)
7650
25
                {
7651
25
                *errorcodeptr = ERR20;
7652
25
                return 0;
7653
25
                }
7654
20.2k
              *lengthptr += delta;
7655
20.2k
              }
7656
7657
            /* This is compiling for real. If there is a set first code unit
7658
            for the group, and we have not yet set a "required code unit", set
7659
            it. */
7660
7661
19.6k
            else
7662
19.6k
              {
7663
19.6k
              if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7664
343
                {
7665
343
                reqcu = firstcu;
7666
343
                reqcuflags = firstcuflags;
7667
343
                }
7668
8.32M
              for (uint32_t i = 1; i < repeat_min; i++)
7669
8.30M
                {
7670
8.30M
                memcpy(code, previous, CU2BYTES(len));
7671
8.30M
                code += len;
7672
8.30M
                }
7673
19.6k
              }
7674
39.9k
            }
7675
7676
102k
          if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7677
102k
          }
7678
7679
        /* This code is common to both the zero and non-zero minimum cases. If
7680
        the maximum is limited, it replicates the group in a nested fashion,
7681
        remembering the bracket starts on a stack. In the case of a zero
7682
        minimum, the first one was set up above. In all cases the repeat_max
7683
        now specifies the number of additional copies needed. Again, we must
7684
        remember to replicate entries on the forward reference list. */
7685
7686
129k
        if (repeat_max != REPEAT_UNLIMITED)
7687
59.7k
          {
7688
          /* In the pre-compile phase, we don't actually do the replication. We
7689
          just adjust the length as if we had. For each repetition we must add
7690
          1 to the length for BRAZERO and for all but the last repetition we
7691
          must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7692
          paranoid checks to avoid integer overflow. */
7693
7694
59.7k
          if (lengthptr != NULL && repeat_max > 0)
7695
5.25k
            {
7696
5.25k
            PCRE2_SIZE delta;
7697
5.25k
            if (PRIV(ckd_smul)(&delta, repeat_max,
7698
5.25k
                               (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7699
5.25k
                OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7700
13
              {
7701
13
              *errorcodeptr = ERR20;
7702
13
              return 0;
7703
13
              }
7704
5.24k
            delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7705
5.24k
            *lengthptr += delta;
7706
5.24k
            }
7707
7708
          /* This is compiling for real */
7709
7710
3.12M
          else for (uint32_t i = repeat_max; i >= 1; i--)
7711
3.07M
            {
7712
3.07M
            *code++ = OP_BRAZERO + repeat_type;
7713
7714
            /* All but the final copy start a new nesting, maintaining the
7715
            chain of brackets outstanding. */
7716
7717
3.07M
            if (i != 1)
7718
3.06M
              {
7719
3.06M
              int linkoffset;
7720
3.06M
              *code++ = OP_BRA;
7721
3.06M
              linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7722
3.06M
              bralink = code;
7723
3.06M
              PUTINC(code, 0, linkoffset);
7724
3.06M
              }
7725
7726
3.07M
            memcpy(code, previous, CU2BYTES(len));
7727
3.07M
            code += len;
7728
3.07M
            }
7729
7730
          /* Now chain through the pending brackets, and fill in their length
7731
          fields (which are holding the chain links pro tem). */
7732
7733
3.13M
          while (bralink != NULL)
7734
3.07M
            {
7735
3.07M
            int oldlinkoffset;
7736
3.07M
            int linkoffset = (int)(code - bralink + 1);
7737
3.07M
            PCRE2_UCHAR *bra = code - linkoffset;
7738
3.07M
            oldlinkoffset = GET(bra, 1);
7739
3.07M
            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7740
3.07M
            *code++ = OP_KET;
7741
3.07M
            PUTINC(code, 0, linkoffset);
7742
3.07M
            PUT(bra, 1, linkoffset);
7743
3.07M
            }
7744
59.7k
          }
7745
7746
        /* If the maximum is unlimited, set a repeater in the final copy. For
7747
        SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7748
        possessively repeated ONCE brackets can be converted into non-capturing
7749
        brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7750
        saves having to deal with possessive ONCEs specially.
7751
7752
        Otherwise, when we are doing the actual compile phase, check to see
7753
        whether this group is one that could match an empty string. If so,
7754
        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7755
        that runtime checking can be done. [This check is also applied to ONCE
7756
        and SCRIPT_RUN groups at runtime, but in a different way.]
7757
7758
        Then, if the quantifier was possessive and the bracket is not a
7759
        conditional, we convert the BRA code to the POS form, and the KET code
7760
        to KETRPOS. (It turns out to be convenient at runtime to detect this
7761
        kind of subpattern at both the start and at the end.) The use of
7762
        special opcodes makes it possible to reduce greatly the stack usage in
7763
        pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7764
        OP_BRAPOSZERO.
7765
7766
        Then, if the minimum number of matches is 1 or 0, cancel the possessive
7767
        flag so that the default action below, of wrapping everything inside
7768
        atomic brackets, does not happen. When the minimum is greater than 1,
7769
        there will be earlier copies of the group, and so we still have to wrap
7770
        the whole thing. */
7771
7772
69.2k
        else
7773
69.2k
          {
7774
69.2k
          PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7775
69.2k
          PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7776
7777
          /* Convert possessive ONCE brackets to non-capturing */
7778
7779
69.2k
          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7780
7781
          /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7782
          to do is to set the KET. */
7783
7784
69.2k
          if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7785
1.53k
            *ketcode = OP_KETRMAX + repeat_type;
7786
7787
          /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7788
          (which have been converted to non-capturing above). */
7789
7790
67.7k
          else
7791
67.7k
            {
7792
            /* In the compile phase, adjust the opcode if the group can match
7793
            an empty string. For a conditional group with only one branch, the
7794
            value of group_return will not show "could be empty", so we must
7795
            check that separately. */
7796
7797
67.7k
            if (lengthptr == NULL)
7798
33.3k
              {
7799
33.3k
              if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7800
33.3k
              if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7801
566
                *bracode = OP_SCOND;
7802
33.3k
              }
7803
7804
            /* Handle possessive quantifiers. */
7805
7806
67.7k
            if (possessive_quantifier)
7807
31.0k
              {
7808
              /* For COND brackets, we wrap the whole thing in a possessively
7809
              repeated non-capturing bracket, because we have not invented POS
7810
              versions of the COND opcodes. */
7811
7812
31.0k
              if (*bracode == OP_COND || *bracode == OP_SCOND)
7813
3.68k
                {
7814
3.68k
                int nlen = (int)(code - bracode);
7815
3.68k
                (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7816
3.68k
                code += 1 + LINK_SIZE;
7817
3.68k
                nlen += 1 + LINK_SIZE;
7818
3.68k
                *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7819
3.68k
                *code++ = OP_KETRPOS;
7820
3.68k
                PUTINC(code, 0, nlen);
7821
3.68k
                PUT(bracode, 1, nlen);
7822
3.68k
                }
7823
7824
              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7825
7826
27.3k
              else
7827
27.3k
                {
7828
27.3k
                *bracode += 1;              /* Switch to xxxPOS opcodes */
7829
27.3k
                *ketcode = OP_KETRPOS;
7830
27.3k
                }
7831
7832
              /* If the minimum is zero, mark it as possessive, then unset the
7833
              possessive flag when the minimum is 0 or 1. */
7834
7835
31.0k
              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7836
31.0k
              if (repeat_min < 2) possessive_quantifier = FALSE;
7837
31.0k
              }
7838
7839
            /* Non-possessive quantifier */
7840
7841
36.7k
            else *ketcode = OP_KETRMAX + repeat_type;
7842
67.7k
            }
7843
69.2k
          }
7844
129k
        }
7845
128k
      break;
7846
7847
      /* If previous was a character type match (\d or similar), abolish it and
7848
      create a suitable repeat item. The code is shared with single-character
7849
      repeats by setting op_type to add a suitable offset into repeat_type.
7850
      Note that the Unicode property types will be present only when
7851
      SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7852
      here because it just makes it horribly messy. */
7853
7854
223k
      default:
7855
7856
      /* LCOV_EXCL_START */
7857
223k
      if (op_previous >= OP_EODN || op_previous <= OP_WORD_BOUNDARY)
7858
0
        {
7859
0
        PCRE2_DEBUG_UNREACHABLE();
7860
0
        *errorcodeptr = ERR10;  /* Not a character type - internal error */
7861
0
        return 0;
7862
0
        }
7863
      /* LCOV_EXCL_STOP */
7864
7865
223k
        {
7866
223k
        int prop_type, prop_value;
7867
223k
        PCRE2_UCHAR *oldcode;
7868
7869
223k
        if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7870
7871
222k
        op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7872
222k
        mclength = 0;                         /* Not a character */
7873
7874
222k
        if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7875
30.7k
          {
7876
30.7k
          prop_type = previous[1];
7877
30.7k
          prop_value = previous[2];
7878
30.7k
          }
7879
192k
        else
7880
192k
          {
7881
          /* Come here from just above with a character in mcbuffer/mclength.
7882
          You must also set op_type before the jump. */
7883
616k
          OUTPUT_SINGLE_REPEAT:
7884
616k
          prop_type = prop_value = -1;
7885
616k
          }
7886
7887
        /* At this point, if prop_type == prop_value == -1 we either have a
7888
        character in mcbuffer when mclength is greater than zero, or we have
7889
        mclength zero, in which case there is a non-property character type in
7890
        op_previous. If prop_type/value are not negative, we have a property
7891
        character type in op_previous. */
7892
7893
647k
        oldcode = code;                   /* Save where we were */
7894
647k
        code = previous;                  /* Usually overwrite previous item */
7895
7896
        /* If the maximum is zero then the minimum must also be zero; Perl allows
7897
        this case, so we do too - by simply omitting the item altogether. */
7898
7899
647k
        if (repeat_max == 0) goto END_REPEAT;
7900
7901
        /* Combine the op_type with the repeat_type */
7902
7903
645k
        repeat_type += op_type;
7904
7905
        /* A minimum of zero is handled either as the special case * or ?, or as
7906
        an UPTO, with the maximum given. */
7907
7908
645k
        if (repeat_min == 0)
7909
379k
          {
7910
379k
          if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7911
166k
            else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7912
22.5k
          else
7913
22.5k
            {
7914
22.5k
            *code++ = OP_UPTO + repeat_type;
7915
22.5k
            PUT2INC(code, 0, repeat_max);
7916
22.5k
            }
7917
379k
          }
7918
7919
        /* A repeat minimum of 1 is optimized into some special cases. If the
7920
        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7921
        left in place and, if the maximum is greater than 1, we use OP_UPTO with
7922
        one less than the maximum. */
7923
7924
266k
        else if (repeat_min == 1)
7925
226k
          {
7926
226k
          if (repeat_max == REPEAT_UNLIMITED)
7927
222k
            *code++ = OP_PLUS + repeat_type;
7928
4.11k
          else
7929
4.11k
            {
7930
4.11k
            code = oldcode;  /* Leave previous item in place */
7931
4.11k
            if (repeat_max == 1) goto END_REPEAT;
7932
4.11k
            *code++ = OP_UPTO + repeat_type;
7933
4.11k
            PUT2INC(code, 0, repeat_max - 1);
7934
4.11k
            }
7935
226k
          }
7936
7937
        /* The case {n,n} is just an EXACT, while the general case {n,m} is
7938
        handled as an EXACT followed by an UPTO or STAR or QUERY. */
7939
7940
40.3k
        else
7941
40.3k
          {
7942
40.3k
          *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7943
40.3k
          PUT2INC(code, 0, repeat_min);
7944
7945
          /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7946
          and then generate the second opcode. For a repeated Unicode property
7947
          match, there are two extra values that define the required property,
7948
          and mclength is set zero to indicate this. */
7949
7950
40.3k
          if (repeat_max != repeat_min)
7951
10.1k
            {
7952
10.1k
            if (mclength > 0)
7953
5.48k
              {
7954
5.48k
              memcpy(code, mcbuffer, CU2BYTES(mclength));
7955
5.48k
              code += mclength;
7956
5.48k
              }
7957
4.68k
            else
7958
4.68k
              {
7959
4.68k
              *code++ = op_previous;
7960
4.68k
              if (prop_type >= 0)
7961
1.00k
                {
7962
1.00k
                *code++ = prop_type;
7963
1.00k
                *code++ = prop_value;
7964
1.00k
                }
7965
4.68k
              }
7966
7967
            /* Now set up the following opcode */
7968
7969
10.1k
            if (repeat_max == REPEAT_UNLIMITED)
7970
7.03k
              *code++ = OP_STAR + repeat_type;
7971
3.13k
            else
7972
3.13k
              {
7973
3.13k
              repeat_max -= repeat_min;
7974
3.13k
              if (repeat_max == 1)
7975
1.06k
                {
7976
1.06k
                *code++ = OP_QUERY + repeat_type;
7977
1.06k
                }
7978
2.07k
              else
7979
2.07k
                {
7980
2.07k
                *code++ = OP_UPTO + repeat_type;
7981
2.07k
                PUT2INC(code, 0, repeat_max);
7982
2.07k
                }
7983
3.13k
              }
7984
10.1k
            }
7985
40.3k
          }
7986
7987
        /* Fill in the character or character type for the final opcode. */
7988
7989
645k
        if (mclength > 0)
7990
423k
          {
7991
423k
          memcpy(code, mcbuffer, CU2BYTES(mclength));
7992
423k
          code += mclength;
7993
423k
          }
7994
222k
        else
7995
222k
          {
7996
222k
          *code++ = op_previous;
7997
222k
          if (prop_type >= 0)
7998
30.7k
            {
7999
30.7k
            *code++ = prop_type;
8000
30.7k
            *code++ = prop_value;
8001
30.7k
            }
8002
222k
          }
8003
645k
        }
8004
0
      break;
8005
883k
      }  /* End of switch on different op_previous values */
8006
8007
8008
    /* If the character following a repeat is '+', possessive_quantifier is
8009
    TRUE. For some opcodes, there are special alternative opcodes for this
8010
    case. For anything else, we wrap the entire repeated item inside OP_ONCE
8011
    brackets. Logically, the '+' notation is just syntactic sugar, taken from
8012
    Sun's Java package, but the special opcodes can optimize it.
8013
8014
    Some (but not all) possessively repeated subpatterns have already been
8015
    completely handled in the code just above. For them, possessive_quantifier
8016
    is always FALSE at this stage. Note that the repeated item starts at
8017
    tempcode, not at previous, which might be the first part of a string whose
8018
    (former) last char we repeated. */
8019
8020
877k
    if (possessive_quantifier)
8021
78.0k
      {
8022
78.0k
      int len;
8023
8024
      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
8025
      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
8026
      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
8027
      remains is greater than zero, there's a further opcode that can be
8028
      handled. If not, do nothing, leaving the EXACT alone. */
8029
8030
78.0k
      switch(*tempcode)
8031
78.0k
        {
8032
1.55k
        case OP_TYPEEXACT:
8033
1.55k
        tempcode += PRIV(OP_lengths)[*tempcode] +
8034
1.55k
          ((tempcode[1 + IMM2_SIZE] == OP_PROP
8035
1.10k
          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
8036
1.55k
        break;
8037
8038
        /* CHAR opcodes are used for exacts whose count is 1. */
8039
8040
837
        case OP_CHAR:
8041
1.17k
        case OP_CHARI:
8042
2.48k
        case OP_NOT:
8043
3.09k
        case OP_NOTI:
8044
5.73k
        case OP_EXACT:
8045
6.55k
        case OP_EXACTI:
8046
7.44k
        case OP_NOTEXACT:
8047
8.52k
        case OP_NOTEXACTI:
8048
8.52k
        tempcode += PRIV(OP_lengths)[*tempcode];
8049
8.52k
#ifdef SUPPORT_UNICODE
8050
8.52k
        if (utf && HAS_EXTRALEN(tempcode[-1]))
8051
907
          tempcode += GET_EXTRALEN(tempcode[-1]);
8052
8.52k
#endif
8053
8.52k
        break;
8054
8055
        /* For the class opcodes, the repeat operator appears at the end;
8056
        adjust tempcode to point to it. */
8057
8058
6.06k
        case OP_CLASS:
8059
13.4k
        case OP_NCLASS:
8060
13.4k
        tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
8061
13.4k
        break;
8062
8063
0
#ifdef SUPPORT_WIDE_CHARS
8064
2.44k
        case OP_XCLASS:
8065
2.99k
        case OP_ECLASS:
8066
2.99k
        tempcode += GET(tempcode, 1);
8067
2.99k
        break;
8068
0
#endif
8069
8070
560
        case OP_REF:
8071
762
        case OP_REFI:
8072
846
        case OP_DNREF:
8073
1.00k
        case OP_DNREFI:
8074
1.00k
        tempcode += PRIV(OP_lengths)[*tempcode];
8075
1.00k
        break;
8076
78.0k
        }
8077
8078
      /* If tempcode is equal to code (which points to the end of the repeated
8079
      item), it means we have skipped an EXACT item but there is no following
8080
      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
8081
      all other cases, tempcode will be pointing to the repeat opcode, and will
8082
      be less than code, so the value of len will be greater than 0. */
8083
8084
78.0k
      len = (int)(code - tempcode);
8085
78.0k
      if (len > 0)
8086
73.8k
        {
8087
73.8k
        unsigned int repcode = *tempcode;
8088
8089
        /* There is a table for possessifying opcodes, all of which are less
8090
        than OP_CALLOUT. A zero entry means there is no possessified version.
8091
        */
8092
8093
73.8k
        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
8094
68.0k
          *tempcode = opcode_possessify[repcode];
8095
8096
        /* For opcode without a special possessified version, wrap the item in
8097
        ONCE brackets. */
8098
8099
5.84k
        else
8100
5.84k
          {
8101
5.84k
          (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
8102
5.84k
          code += 1 + LINK_SIZE;
8103
5.84k
          len += 1 + LINK_SIZE;
8104
5.84k
          tempcode[0] = OP_ONCE;
8105
5.84k
          *code++ = OP_KET;
8106
5.84k
          PUTINC(code, 0, len);
8107
5.84k
          PUT(tempcode, 1, len);
8108
5.84k
          }
8109
73.8k
        }
8110
78.0k
      }
8111
8112
    /* We set the "follows varying string" flag for subsequently encountered
8113
    reqcus if it isn't already set and we have just passed a varying length
8114
    item. */
8115
8116
883k
    END_REPEAT:
8117
883k
    cb->req_varyopt |= reqvary;
8118
883k
    break;
8119
8120
8121
    /* ===================================================================*/
8122
    /* Handle a 32-bit data character with a value greater than META_END. */
8123
8124
0
    case META_BIGVALUE:
8125
0
    pptr++;
8126
0
    goto NORMAL_CHAR;
8127
8128
8129
    /* ===============================================================*/
8130
    /* Handle a back reference by number, which is the meta argument. The
8131
    pattern offsets for back references to group numbers less than 10 are held
8132
    in a special vector, to avoid using more than two parsed pattern elements
8133
    in 64-bit environments. We only need the offset to the first occurrence,
8134
    because if that doesn't fail, subsequent ones will also be OK. */
8135
8136
21.0k
    case META_BACKREF:
8137
21.0k
    if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8138
2.95k
      else GETPLUSOFFSET(offset, pptr);
8139
8140
21.0k
    if (meta_arg > cb->bracount)
8141
1.03k
      {
8142
1.03k
      cb->erroroffset = offset;
8143
1.03k
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8144
1.03k
      return 0;
8145
1.03k
      }
8146
8147
    /* Come here from named backref handling when the reference is to a
8148
    single group (that is, not to a duplicated name). The back reference
8149
    data will have already been updated. We must disable firstcu if not
8150
    set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8151
    later. */
8152
8153
20.9k
    HANDLE_SINGLE_REFERENCE:
8154
20.9k
    if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8155
20.9k
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8156
20.9k
    PUT2INC(code, 0, meta_arg);
8157
20.9k
    if ((options & PCRE2_CASELESS) != 0)
8158
6.36k
      *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
8159
6.18k
                 REFI_FLAG_CASELESS_RESTRICT : 0) |
8160
6.36k
                (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
8161
6.36k
                 REFI_FLAG_TURKISH_CASING : 0);
8162
8163
    /* Update the map of back references, and keep the highest one. We
8164
    could do this in parse_regex() for numerical back references, but not
8165
    for named back references, because we don't know the numbers to which
8166
    named back references refer. So we do it all in this function. */
8167
8168
20.9k
    cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8169
20.9k
    if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8170
20.9k
    break;
8171
8172
8173
    /* ===============================================================*/
8174
    /* Handle recursion by inserting the number of the called group (which is
8175
    the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8176
    scanned and these numbers are replaced by offsets within the pattern. It is
8177
    done like this to avoid problems with forward references and adjusting
8178
    offsets when groups are duplicated and moved (as discovered in previous
8179
    implementations). Note that a recursion does not have a set first
8180
    character. */
8181
8182
133k
    case META_RECURSE:
8183
133k
    GETPLUSOFFSET(offset, pptr);
8184
133k
    if (meta_arg > cb->bracount)
8185
211
      {
8186
211
      cb->erroroffset = offset;
8187
211
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8188
211
      return 0;
8189
211
      }
8190
133k
    HANDLE_NUMERICAL_RECURSION:
8191
133k
    *code = OP_RECURSE;
8192
133k
    PUT(code, 1, meta_arg);
8193
133k
    code += 1 + LINK_SIZE;
8194
    /* Repeat processing requires this information to
8195
    determine the real length in pre-compile phase. */
8196
133k
    length_prevgroup = 1 + LINK_SIZE;
8197
8198
133k
    if (META_CODE(pptr[1]) == META_OFFSET ||
8199
133k
        META_CODE(pptr[1]) == META_CAPTURE_NAME ||
8200
131k
        META_CODE(pptr[1]) == META_CAPTURE_NUMBER)
8201
8.16k
      {
8202
8.16k
      recurse_arguments *args;
8203
8204
8.16k
      if (lengthptr != NULL)
8205
4.13k
        {
8206
4.13k
        if (!PRIV(compile_parse_recurse_args)(pptr, offset, errorcodeptr, cb))
8207
42
          return 0;
8208
8209
4.09k
        args = (recurse_arguments*)cb->last_data;
8210
4.09k
        length_prevgroup += (args->size * (1 + IMM2_SIZE));
8211
4.09k
        *lengthptr += (args->size * (1 + IMM2_SIZE));
8212
4.09k
        pptr += args->skip_size;
8213
4.09k
        }
8214
4.02k
      else
8215
4.02k
        {
8216
4.02k
        uint16_t *current, *end;
8217
8218
4.02k
        args = (recurse_arguments*)cb->first_data;
8219
4.02k
        PCRE2_ASSERT(args != NULL && args->header.type == CDATA_RECURSE_ARGS);
8220
8221
4.02k
        current = (uint16_t*)(args + 1);
8222
4.02k
        end = current + args->size;
8223
4.02k
        PCRE2_ASSERT(end > current);
8224
8225
4.02k
        do
8226
13.8k
          {
8227
13.8k
          code[0] = OP_CREF;
8228
13.8k
          PUT2(code, 1, *current);
8229
13.8k
          code += 1 + IMM2_SIZE;
8230
13.8k
          }
8231
13.8k
        while (++current < end);
8232
8233
4.02k
        length_prevgroup += (args->size * (1 + IMM2_SIZE));
8234
4.02k
        pptr += args->skip_size;
8235
4.02k
        cb->first_data = args->header.next;
8236
4.02k
        cb->cx->memctl.free(args, cb->cx->memctl.memory_data);
8237
4.02k
        }
8238
8.16k
      }
8239
8240
133k
    groupsetfirstcu = FALSE;
8241
133k
    cb->had_recurse = TRUE;
8242
133k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8243
133k
    zerofirstcu = firstcu;
8244
133k
    zerofirstcuflags = firstcuflags;
8245
133k
    break;
8246
8247
8248
    /* ===============================================================*/
8249
    /* Handle capturing parentheses; the number is the meta argument. */
8250
8251
917k
    case META_CAPTURE:
8252
917k
    bravalue = OP_CBRA;
8253
917k
    skipunits = IMM2_SIZE;
8254
917k
    PUT2(code, 1+LINK_SIZE, meta_arg);
8255
917k
    cb->lastcapture = meta_arg;
8256
917k
    goto GROUP_PROCESS_NOTE_EMPTY;
8257
8258
8259
    /* ===============================================================*/
8260
    /* Handle escape sequence items. For ones like \d, the ESC_values are
8261
    arranged to be the same as the corresponding OP_values in the default case
8262
    when PCRE2_UCP is not set (which is the only case in which they will appear
8263
    here).
8264
8265
    Note: \Q and \E are never seen here, as they were dealt with in
8266
    parse_regex(). Neither are numerical back references or recursions, which
8267
    were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8268
    \g, when followed by names, are turned into META_BACKREF_BYNAME or
8269
    META_RECURSE_BYNAME. */
8270
8271
418k
    case META_ESCAPE:
8272
8273
    /* We can test for escape sequences that consume a character because their
8274
    values lie between ESC_b and ESC_Z; this may have to change if any new ones
8275
    are ever created. For these sequences, we disable the setting of a first
8276
    character if it hasn't already been set. */
8277
8278
418k
    if (meta_arg > ESC_b && meta_arg < ESC_Z)
8279
366k
      {
8280
366k
      matched_char = TRUE;
8281
366k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8282
366k
      }
8283
8284
    /* Set values to reset to if this is followed by a zero repeat. */
8285
8286
418k
    zerofirstcu = firstcu;
8287
418k
    zerofirstcuflags = firstcuflags;
8288
418k
    zeroreqcu = reqcu;
8289
418k
    zeroreqcuflags = reqcuflags;
8290
8291
    /* If Unicode is not supported, \P and \p are not allowed and are
8292
    faulted at parse time, so will never appear here. */
8293
8294
418k
#ifdef SUPPORT_UNICODE
8295
418k
    if (meta_arg == ESC_P || meta_arg == ESC_p)
8296
63.4k
      {
8297
63.4k
      uint32_t ptype = *(++pptr) >> 16;
8298
63.4k
      uint32_t pdata = *pptr & 0xffff;
8299
8300
      /* In caseless matching, particular characteristics Lu, Ll, and Lt get
8301
      converted to the general characteristic L&. That is, upper, lower, and
8302
      title case letters are all conflated. */
8303
8304
63.4k
      if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
8305
8.40k
          (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
8306
535
        {
8307
535
        ptype = PT_LAMP;
8308
535
        pdata = 0;
8309
535
        }
8310
8311
      /* The special case of \p{Any} is compiled to OP_ALLANY and \P{Any}
8312
      is compiled to [] so as to benefit from the auto-anchoring code. */
8313
8314
63.4k
      if (ptype == PT_ANY)
8315
324
        {
8316
324
        if (meta_arg == ESC_P)
8317
104
          {
8318
104
          *code++ = OP_CLASS;
8319
104
          memset(code, 0, 32);
8320
104
          code += 32 / sizeof(PCRE2_UCHAR);
8321
104
          }
8322
220
        else
8323
220
          *code++ = OP_ALLANY;
8324
324
        }
8325
63.1k
      else
8326
63.1k
        {
8327
63.1k
        *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8328
63.1k
        *code++ = ptype;
8329
63.1k
        *code++ = pdata;
8330
63.1k
        }
8331
63.4k
      break;  /* End META_ESCAPE */
8332
63.4k
      }
8333
354k
#endif
8334
8335
    /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8336
    done. However, there's an option, in case anyone was relying on it. */
8337
8338
354k
    if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8339
8
        (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8340
8
      {
8341
8
      *errorcodeptr = ERR99;
8342
8
      return 0;
8343
8
      }
8344
8345
    /* For the rest (including \X when Unicode is supported - if not it's
8346
    faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8347
    not set; if it is set, most of them do not show up here because they are
8348
    converted into Unicode property tests in parse_regex().
8349
8350
    In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8351
    instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8352
    There are special UCP codes for \B and \b which are used in UCP mode unless
8353
    "word" matching is being forced to ASCII.
8354
8355
    Note that \b and \B do a one-character lookbehind, and \A also behaves as
8356
    if it does. */
8357
8358
354k
    switch(meta_arg)
8359
354k
      {
8360
0
      case ESC_C:
8361
0
      cb->external_flags |= PCRE2_HASBKC;  /* Record */
8362
#if PCRE2_CODE_UNIT_WIDTH == 32
8363
      meta_arg = OP_ALLANY;
8364
      (void)utf; /* Avoid compiler warning. */
8365
#else
8366
0
      if (!utf) meta_arg = OP_ALLANY;
8367
0
#endif
8368
0
      break;
8369
8370
11.5k
      case ESC_B:
8371
25.1k
      case ESC_b:
8372
25.1k
      if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8373
6.17k
        meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8374
6.17k
          OP_UCP_WORD_BOUNDARY;
8375
25.1k
      PCRE2_FALLTHROUGH /* Fall through */
8376
8377
33.2k
      case ESC_A:
8378
33.2k
      if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8379
33.2k
      break;
8380
8381
1.50k
      case ESC_K:
8382
1.50k
      cb->external_flags |= PCRE2_HASBSK;  /* Record */
8383
1.50k
      break;
8384
354k
      }
8385
8386
354k
    *code++ = meta_arg;
8387
354k
    break;  /* End META_ESCAPE */
8388
8389
8390
    /* ===================================================================*/
8391
    /* Handle an unrecognized meta value. A parsed pattern value less than
8392
    META_END is a literal. Otherwise we have a problem. */
8393
8394
7.60M
    default:
8395
    /* LCOV_EXCL_START */
8396
7.60M
    if (meta >= META_END)
8397
0
      {
8398
0
      PCRE2_DEBUG_UNREACHABLE();
8399
0
      *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8400
0
      return 0;
8401
0
      }
8402
    /* LCOV_EXCL_STOP */
8403
8404
    /* Handle a literal character. We come here by goto in the case of a
8405
    32-bit, non-UTF character whose value is greater than META_END. */
8406
8407
7.60M
    NORMAL_CHAR:
8408
7.60M
    meta = *pptr;     /* Get the full 32 bits */
8409
7.61M
    NORMAL_CHAR_SET:  /* Character is already in meta */
8410
7.61M
    matched_char = TRUE;
8411
8412
    /* For caseless UTF or UCP mode, check whether this character has more than
8413
    one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8414
    When casing restrictions apply, ignore caseless sets that start with an
8415
    ASCII character. If the character is affected by the special Turkish rules,
8416
    hardcode the matching characters using a caseset. */
8417
8418
7.61M
#ifdef SUPPORT_UNICODE
8419
7.61M
    if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8420
1.34M
      {
8421
1.34M
      uint32_t caseset;
8422
8423
1.34M
      if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
8424
1.34M
            PCRE2_EXTRA_TURKISH_CASING &&
8425
0
          UCD_ANY_I(meta))
8426
0
        {
8427
0
        caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(meta)? 0 : 3);
8428
0
        }
8429
1.34M
      else if ((caseset = UCD_CASESET(meta)) != 0 &&
8430
114k
               (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
8431
1.22k
               PRIV(ucd_caseless_sets)[caseset] < 128)
8432
875
        {
8433
875
        caseset = 0;  /* Ignore the caseless set if it's restricted. */
8434
875
        }
8435
8436
1.34M
      if (caseset != 0)
8437
113k
        {
8438
113k
        *code++ = OP_PROP;
8439
113k
        *code++ = PT_CLIST;
8440
113k
        *code++ = caseset;
8441
113k
        if (firstcuflags == REQ_UNSET)
8442
3.14k
          firstcuflags = zerofirstcuflags = REQ_NONE;
8443
113k
        break;  /* End handling this meta item */
8444
113k
        }
8445
1.34M
      }
8446
7.49M
#endif
8447
8448
    /* Caseful matches, or caseless and not one of the multicase characters. We
8449
    come here by goto in the case of a positive class that contains only
8450
    case-partners of a character with just two cases; matched_char has already
8451
    been set TRUE and options fudged if necessary. */
8452
8453
7.49M
    CLASS_CASELESS_CHAR:
8454
8455
    /* Get the character's code units into mcbuffer, with the length in
8456
    mclength. When not in UTF mode, the length is always 1. */
8457
8458
7.49M
#ifdef SUPPORT_UNICODE
8459
7.49M
    if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8460
6.81M
#endif
8461
6.81M
      {
8462
6.81M
      mclength = 1;
8463
6.81M
      mcbuffer[0] = meta;
8464
6.81M
      }
8465
8466
    /* Generate the appropriate code */
8467
8468
7.49M
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8469
7.49M
    memcpy(code, mcbuffer, CU2BYTES(mclength));
8470
7.49M
    code += mclength;
8471
8472
    /* Remember if \r or \n were seen */
8473
8474
7.49M
    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8475
94.6k
      cb->external_flags |= PCRE2_HASCRORLF;
8476
8477
    /* Set the first and required code units appropriately. If no previous
8478
    first code unit, set it from this character, but revert to none on a zero
8479
    repeat. Otherwise, leave the firstcu value alone, and don't change it on
8480
    a zero repeat. */
8481
8482
7.49M
    if (firstcuflags == REQ_UNSET)
8483
426k
      {
8484
426k
      zerofirstcuflags = REQ_NONE;
8485
426k
      zeroreqcu = reqcu;
8486
426k
      zeroreqcuflags = reqcuflags;
8487
8488
      /* If the character is more than one code unit long, we can set a single
8489
      firstcu only if it is not to be matched caselessly. Multiple possible
8490
      starting code units may be picked up later in the studying code. */
8491
8492
426k
      if (mclength == 1 || req_caseopt == 0)
8493
423k
        {
8494
423k
        firstcu = mcbuffer[0];
8495
423k
        firstcuflags = req_caseopt;
8496
423k
        if (mclength != 1)
8497
1.58k
          {
8498
1.58k
          reqcu = code[-1];
8499
1.58k
          reqcuflags = cb->req_varyopt;
8500
1.58k
          }
8501
423k
        }
8502
2.29k
      else firstcuflags = reqcuflags = REQ_NONE;
8503
426k
      }
8504
8505
    /* firstcu was previously set; we can set reqcu only if the length is
8506
    1 or the matching is caseful. */
8507
8508
7.07M
    else
8509
7.07M
      {
8510
7.07M
      zerofirstcu = firstcu;
8511
7.07M
      zerofirstcuflags = firstcuflags;
8512
7.07M
      zeroreqcu = reqcu;
8513
7.07M
      zeroreqcuflags = reqcuflags;
8514
7.07M
      if (mclength == 1 || req_caseopt == 0)
8515
7.05M
        {
8516
7.05M
        reqcu = code[-1];
8517
7.05M
        reqcuflags = req_caseopt | cb->req_varyopt;
8518
7.05M
        }
8519
7.07M
      }
8520
8521
    /* If caselessness was temporarily instated, reset it. */
8522
8523
7.49M
    if (reset_caseful)
8524
480
      {
8525
480
      options &= ~PCRE2_CASELESS;
8526
480
      req_caseopt = 0;
8527
480
      reset_caseful = FALSE;
8528
480
      }
8529
8530
7.49M
    break;    /* End literal character handling */
8531
15.0M
    }         /* End of big switch */
8532
15.0M
  }           /* End of big loop */
8533
8534
/* LCOV_EXCL_START */
8535
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8536
0
return 0;                  /* Avoid compiler warnings */
8537
/* LCOV_EXCL_STOP */
8538
2.14M
}
8539
8540
8541
8542
/*************************************************
8543
*   Compile regex: a sequence of alternatives    *
8544
*************************************************/
8545
8546
/* On entry, pptr is pointing past the bracket meta, but on return it points to
8547
the closing bracket or META_END. The code variable is pointing at the code unit
8548
into which the BRA operator has been stored. This function is used during the
8549
pre-compile phase when we are trying to find out the amount of memory needed,
8550
as well as during the real compile phase. The value of lengthptr distinguishes
8551
the two phases.
8552
8553
Arguments:
8554
  options           option bits, including any changes for this subpattern
8555
  xoptions          extra option bits, ditto
8556
  codeptr           -> the address of the current code pointer
8557
  pptrptr           -> the address of the current parsed pattern pointer
8558
  errorcodeptr      -> pointer to error code variable
8559
  skipunits         skip this many code units at start (for brackets and OP_COND)
8560
  firstcuptr        place to put the first required code unit
8561
  firstcuflagsptr   place to put the first code unit flags
8562
  reqcuptr          place to put the last required code unit
8563
  reqcuflagsptr     place to put the last required code unit flags
8564
  bcptr             pointer to the chain of currently open branches
8565
  cb                points to the data block with tables pointers etc.
8566
  lengthptr         NULL during the real compile phase
8567
                    points to length accumulator during pre-compile phase
8568
8569
Returns:            0 There has been an error
8570
                   +1 Success, this group must match at least one character
8571
                   -1 Success, this group may match an empty string
8572
*/
8573
8574
static int
8575
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8576
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8577
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8578
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8579
  compile_block *cb, PCRE2_SIZE *lengthptr)
8580
1.26M
{
8581
1.26M
PCRE2_UCHAR *code = *codeptr;
8582
1.26M
PCRE2_UCHAR *last_branch = code;
8583
1.26M
PCRE2_UCHAR *start_bracket = code;
8584
1.26M
BOOL lookbehind;
8585
1.26M
open_capitem capitem;
8586
1.26M
int capnumber = 0;
8587
1.26M
int okreturn = 1;
8588
1.26M
uint32_t *pptr = *pptrptr;
8589
1.26M
uint32_t firstcu, reqcu;
8590
1.26M
uint32_t lookbehindlength;
8591
1.26M
uint32_t lookbehindminlength;
8592
1.26M
uint32_t firstcuflags, reqcuflags;
8593
1.26M
PCRE2_SIZE length;
8594
1.26M
branch_chain bc;
8595
8596
/* If set, call the external function that checks for stack availability. */
8597
8598
1.26M
if (cb->cx->stack_guard != NULL &&
8599
0
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8600
0
  {
8601
0
  *errorcodeptr= ERR33;
8602
0
  cb->erroroffset = 0;
8603
0
  return 0;
8604
0
  }
8605
8606
/* Miscellaneous initialization */
8607
8608
1.26M
bc.outer = bcptr;
8609
1.26M
bc.current_branch = code;
8610
8611
1.26M
firstcu = reqcu = 0;
8612
1.26M
firstcuflags = reqcuflags = REQ_UNSET;
8613
8614
/* Accumulate the length for use in the pre-compile phase. Start with the
8615
length of the BRA and KET and any extra code units that are required at the
8616
beginning. We accumulate in a local variable to save frequent testing of
8617
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8618
start and end of each alternative, because compiled items are discarded during
8619
the pre-compile phase so that the workspace is not exceeded. */
8620
8621
1.26M
length = 2 + 2*LINK_SIZE + skipunits;
8622
8623
/* Remember if this is a lookbehind assertion, and if it is, save its length
8624
and skip over the pattern offset. */
8625
8626
1.26M
lookbehind = *code == OP_ASSERTBACK ||
8627
1.26M
             *code == OP_ASSERTBACK_NOT ||
8628
1.25M
             *code == OP_ASSERTBACK_NA;
8629
8630
1.26M
if (lookbehind)
8631
19.8k
  {
8632
19.8k
  lookbehindlength = META_DATA(pptr[-1]);
8633
19.8k
  lookbehindminlength = *pptr;
8634
19.8k
  pptr += SIZEOFFSET;
8635
19.8k
  }
8636
1.24M
else lookbehindlength = lookbehindminlength = 0;
8637
8638
/* If this is a capturing subpattern, add to the chain of open capturing items
8639
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8640
need be tested here; changing this opcode to one of its variants, e.g.
8641
OP_SCBRAPOS, happens later, after the group has been compiled. */
8642
8643
1.26M
if (*code == OP_CBRA)
8644
917k
  {
8645
917k
  capnumber = GET2(code, 1 + LINK_SIZE);
8646
917k
  capitem.number = capnumber;
8647
917k
  capitem.next = open_caps;
8648
917k
  capitem.assert_depth = cb->assert_depth;
8649
917k
  open_caps = &capitem;
8650
917k
  }
8651
8652
/* Offset is set zero to mark that this bracket is still open */
8653
8654
1.26M
PUT(code, 1, 0);
8655
1.26M
code += 1 + LINK_SIZE + skipunits;
8656
8657
/* Loop for each alternative branch */
8658
8659
1.26M
for (;;)
8660
2.14M
  {
8661
2.14M
  int branch_return;
8662
2.14M
  uint32_t branchfirstcu = 0, branchreqcu = 0;
8663
2.14M
  uint32_t branchfirstcuflags = REQ_UNSET, branchreqcuflags = REQ_UNSET;
8664
8665
  /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8666
  is only a single minimum length for the whole assertion. When the minimum
8667
  length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8668
  though not necessarily the same length. In this case, the original OP_REVERSE
8669
  can be used. It can also be used if a branch in a variable length lookbehind
8670
  has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8671
  maximum and minimum values. */
8672
8673
2.14M
  if (lookbehind && lookbehindlength > 0)
8674
19.5k
    {
8675
19.5k
    if (lookbehindminlength == LOOKBEHIND_MAX ||
8676
7.60k
        lookbehindminlength == lookbehindlength)
8677
12.6k
      {
8678
12.6k
      *code++ = OP_REVERSE;
8679
12.6k
      PUT2INC(code, 0, lookbehindlength);
8680
12.6k
      length += 1 + IMM2_SIZE;
8681
12.6k
      }
8682
6.92k
    else
8683
6.92k
      {
8684
6.92k
      *code++ = OP_VREVERSE;
8685
6.92k
      PUT2INC(code, 0, lookbehindminlength);
8686
6.92k
      PUT2INC(code, 0, lookbehindlength);
8687
6.92k
      length += 1 + 2*IMM2_SIZE;
8688
6.92k
      }
8689
19.5k
    }
8690
8691
  /* Now compile the branch; in the pre-compile phase its length gets added
8692
  into the length. */
8693
8694
2.14M
  if ((branch_return =
8695
2.14M
        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8696
2.14M
          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8697
2.14M
          &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8698
2.87k
    return 0;
8699
8700
  /* If a branch can match an empty string, so can the whole group. */
8701
8702
2.13M
  if (branch_return < 0) okreturn = -1;
8703
8704
  /* In the real compile phase, there is some post-processing to be done. */
8705
8706
2.13M
  if (lengthptr == NULL)
8707
1.05M
    {
8708
    /* If this is the first branch, the firstcu and reqcu values for the
8709
    branch become the values for the regex. */
8710
8711
1.05M
    if (*last_branch != OP_ALT)
8712
626k
      {
8713
626k
      firstcu = branchfirstcu;
8714
626k
      firstcuflags = branchfirstcuflags;
8715
626k
      reqcu = branchreqcu;
8716
626k
      reqcuflags = branchreqcuflags;
8717
626k
      }
8718
8719
    /* If this is not the first branch, the first char and reqcu have to
8720
    match the values from all the previous branches, except that if the
8721
    previous value for reqcu didn't have REQ_VARY set, it can still match,
8722
    and we set REQ_VARY for the group from this branch's value. */
8723
8724
431k
    else
8725
431k
      {
8726
      /* If we previously had a firstcu, but it doesn't match the new branch,
8727
      we have to abandon the firstcu for the regex, but if there was
8728
      previously no reqcu, it takes on the value of the old firstcu. */
8729
8730
431k
      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8731
353k
        {
8732
353k
        if (firstcuflags < REQ_NONE)
8733
23.1k
          {
8734
23.1k
          if (reqcuflags >= REQ_NONE)
8735
4.95k
            {
8736
4.95k
            reqcu = firstcu;
8737
4.95k
            reqcuflags = firstcuflags;
8738
4.95k
            }
8739
23.1k
          }
8740
353k
        firstcuflags = REQ_NONE;
8741
353k
        }
8742
8743
      /* If we (now or from before) have no firstcu, a firstcu from the
8744
      branch becomes a reqcu if there isn't a branch reqcu. */
8745
8746
431k
      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8747
73.3k
          branchreqcuflags >= REQ_NONE)
8748
15.8k
        {
8749
15.8k
        branchreqcu = branchfirstcu;
8750
15.8k
        branchreqcuflags = branchfirstcuflags;
8751
15.8k
        }
8752
8753
      /* Now ensure that the reqcus match */
8754
8755
431k
      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8756
46.1k
          reqcu != branchreqcu)
8757
410k
        reqcuflags = REQ_NONE;
8758
21.1k
      else
8759
21.1k
        {
8760
21.1k
        reqcu = branchreqcu;
8761
21.1k
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8762
21.1k
        }
8763
431k
      }
8764
1.05M
    }
8765
8766
  /* Handle reaching the end of the expression, either ')' or end of pattern.
8767
  In the real compile phase, go back through the alternative branches and
8768
  reverse the chain of offsets, with the field in the BRA item now becoming an
8769
  offset to the first alternative. If there are no alternatives, it points to
8770
  the end of the group. The length in the terminating ket is always the length
8771
  of the whole bracketed item. Return leaving the pointer at the terminating
8772
  char. */
8773
8774
2.13M
  if (META_CODE(*pptr) != META_ALT)
8775
1.26M
    {
8776
1.26M
    if (lengthptr == NULL)
8777
626k
      {
8778
626k
      uint32_t branch_length = (uint32_t)(code - last_branch);
8779
626k
      do
8780
1.05M
        {
8781
1.05M
        uint32_t prev_length = GET(last_branch, 1);
8782
1.05M
        PUT(last_branch, 1, branch_length);
8783
1.05M
        branch_length = prev_length;
8784
1.05M
        last_branch -= branch_length;
8785
1.05M
        }
8786
1.05M
      while (branch_length > 0);
8787
626k
      }
8788
8789
    /* Fill in the ket */
8790
8791
1.26M
    *code = OP_KET;
8792
1.26M
    PUT(code, 1, (uint32_t)(code - start_bracket));
8793
1.26M
    code += 1 + LINK_SIZE;
8794
8795
    /* Set values to pass back */
8796
8797
1.26M
    *codeptr = code;
8798
1.26M
    *pptrptr = pptr;
8799
1.26M
    *firstcuptr = firstcu;
8800
1.26M
    *firstcuflagsptr = firstcuflags;
8801
1.26M
    *reqcuptr = reqcu;
8802
1.26M
    *reqcuflagsptr = reqcuflags;
8803
1.26M
    if (lengthptr != NULL)
8804
638k
      {
8805
638k
      if (OFLOW_MAX - *lengthptr < length)
8806
0
        {
8807
0
        *errorcodeptr = ERR20;
8808
0
        return 0;
8809
0
        }
8810
638k
      *lengthptr += length;
8811
638k
      }
8812
1.26M
    return okreturn;
8813
1.26M
    }
8814
8815
  /* Another branch follows. In the pre-compile phase, we can move the code
8816
  pointer back to where it was for the start of the first branch. (That is,
8817
  pretend that each branch is the only one.)
8818
8819
  In the real compile phase, insert an ALT node. Its length field points back
8820
  to the previous branch while the bracket remains open. At the end the chain
8821
  is reversed. It's done like this so that the start of the bracket has a
8822
  zero offset until it is closed, making it possible to detect recursion. */
8823
8824
874k
  if (lengthptr != NULL)
8825
443k
    {
8826
443k
    code = *codeptr + 1 + LINK_SIZE + skipunits;
8827
443k
    length += 1 + LINK_SIZE;
8828
443k
    }
8829
431k
  else
8830
431k
    {
8831
431k
    *code = OP_ALT;
8832
431k
    PUT(code, 1, (int)(code - last_branch));
8833
431k
    bc.current_branch = last_branch = code;
8834
431k
    code += 1 + LINK_SIZE;
8835
431k
    }
8836
8837
  /* Set the maximum lookbehind length for the next branch (if not in a
8838
  lookbehind the value will be zero) and then advance past the vertical bar. */
8839
8840
874k
  lookbehindlength = META_DATA(*pptr);
8841
874k
  pptr++;
8842
874k
  }
8843
8844
/* LCOV_EXCL_START */
8845
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8846
0
return 0;                  /* Avoid compiler warnings */
8847
/* LCOV_EXCL_STOP */
8848
1.26M
}
8849
8850
8851
8852
/*************************************************
8853
*          Check for anchored pattern            *
8854
*************************************************/
8855
8856
/* Try to find out if this is an anchored regular expression. Consider each
8857
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8858
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8859
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8860
be found, because ^ generates OP_CIRCM in that mode.
8861
8862
We can also consider a regex to be anchored if OP_SOM starts all its branches.
8863
This is the code for \G, which means "match at start of match position, taking
8864
into account the match offset".
8865
8866
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8867
because that will try the rest of the pattern at all possible matching points,
8868
so there is no point trying again.... er ....
8869
8870
.... except when the .* appears inside capturing parentheses, and there is a
8871
subsequent back reference to those parentheses. We haven't enough information
8872
to catch that case precisely.
8873
8874
At first, the best we could do was to detect when .* was in capturing brackets
8875
and the highest back reference was greater than or equal to that level.
8876
However, by keeping a bitmap of the first 31 back references, we can catch some
8877
of the more common cases more precisely.
8878
8879
... A second exception is when the .* appears inside an atomic group, because
8880
this prevents the number of characters it matches from being adjusted.
8881
8882
Arguments:
8883
  code           points to start of the compiled pattern
8884
  bracket_map    a bitmap of which brackets we are inside while testing; this
8885
                   handles up to substring 31; after that we just have to take
8886
                   the less precise approach
8887
  cb             points to the compile data block
8888
  atomcount      atomic group level
8889
  inassert       TRUE if in an assertion
8890
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8891
8892
Returns:     TRUE or FALSE
8893
*/
8894
8895
static BOOL
8896
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8897
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8898
84.0k
{
8899
87.1k
do {
8900
87.1k
   PCRE2_SPTR scode = first_significant_code(
8901
87.1k
     code + PRIV(OP_lengths)[*code], FALSE);
8902
87.1k
   int op = *scode;
8903
8904
   /* Non-capturing brackets */
8905
8906
87.1k
   if (op == OP_BRA  || op == OP_BRAPOS ||
8907
85.2k
       op == OP_SBRA || op == OP_SBRAPOS)
8908
2.49k
     {
8909
2.49k
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8910
1.88k
       return FALSE;
8911
2.49k
     }
8912
8913
   /* Capturing brackets */
8914
8915
84.6k
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8916
78.5k
            op == OP_SCBRA || op == OP_SCBRAPOS)
8917
7.12k
     {
8918
7.12k
     int n = GET2(scode, 1+LINK_SIZE);
8919
7.12k
     uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8920
7.12k
     if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;
8921
7.12k
     }
8922
8923
   /* Positive forward assertion */
8924
8925
77.5k
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8926
1.93k
     {
8927
1.93k
     if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;
8928
1.93k
     }
8929
8930
   /* Condition. If there is no second branch, it can't be anchored. */
8931
8932
75.5k
   else if (op == OP_COND || op == OP_SCOND)
8933
702
     {
8934
702
     if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8935
237
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8936
161
       return FALSE;
8937
237
     }
8938
8939
   /* Atomic groups */
8940
8941
74.8k
   else if (op == OP_ONCE)
8942
669
     {
8943
669
     if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8944
561
       return FALSE;
8945
669
     }
8946
8947
   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8948
   it isn't in brackets that are or may be referenced or inside an atomic
8949
   group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8950
   because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8951
   with the subject "aab", which matches "b", i.e. not at the start of a line.
8952
   There is also an option that disables auto-anchoring. */
8953
8954
74.2k
   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8955
68.4k
             op == OP_TYPEPOSSTAR))
8956
6.96k
     {
8957
6.96k
     if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8958
1.89k
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8959
5.17k
       return FALSE;
8960
6.96k
     }
8961
8962
   /* Check for explicit anchoring */
8963
8964
67.2k
   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8965
8966
5.58k
   code += GET(code, 1);
8967
5.58k
   }
8968
84.0k
while (*code == OP_ALT);   /* Loop for each alternative */
8969
2.47k
return TRUE;
8970
84.0k
}
8971
8972
8973
8974
/*************************************************
8975
*         Check for starting with ^ or .*        *
8976
*************************************************/
8977
8978
/* This is called to find out if every branch starts with ^ or .* so that
8979
"first char" processing can be done to speed things up in multiline
8980
matching and for non-DOTALL patterns that start with .* (which must start at
8981
the beginning or after \n). As in the case of is_anchored() (see above), we
8982
have to take account of back references to capturing brackets that contain .*
8983
because in that case we can't make the assumption. Also, the appearance of .*
8984
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8985
or *SKIP does not count, because once again the assumption no longer holds.
8986
8987
Arguments:
8988
  code           points to start of the compiled pattern or a group
8989
  bracket_map    a bitmap of which brackets we are inside while testing; this
8990
                   handles up to substring 31; after that we just have to take
8991
                   the less precise approach
8992
  cb             points to the compile data
8993
  atomcount      atomic group level
8994
  inassert       TRUE if in an assertion
8995
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8996
8997
Returns:         TRUE or FALSE
8998
*/
8999
9000
static BOOL
9001
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
9002
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
9003
48.3k
{
9004
53.8k
do {
9005
53.8k
   PCRE2_SPTR scode = first_significant_code(
9006
53.8k
     code + PRIV(OP_lengths)[*code], FALSE);
9007
53.8k
   int op = *scode;
9008
9009
   /* If we are at the start of a conditional assertion group, *both* the
9010
   conditional assertion *and* what follows the condition must satisfy the test
9011
   for start of line. Other kinds of condition fail. Note that there may be an
9012
   auto-callout at the start of a condition. */
9013
9014
53.8k
   if (op == OP_COND)
9015
466
     {
9016
466
     scode += 1 + LINK_SIZE;
9017
9018
466
     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
9019
366
       else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
9020
9021
466
     switch (*scode)
9022
466
       {
9023
17
       case OP_CREF:
9024
28
       case OP_DNCREF:
9025
62
       case OP_RREF:
9026
75
       case OP_DNRREF:
9027
95
       case OP_FAIL:
9028
102
       case OP_FALSE:
9029
113
       case OP_TRUE:
9030
113
       return FALSE;
9031
9032
353
       default:     /* Assertion */
9033
353
       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
9034
324
         return FALSE;
9035
102
       do scode += GET(scode, 1); while (*scode == OP_ALT);
9036
29
       scode += 1 + LINK_SIZE;
9037
29
       break;
9038
466
       }
9039
29
     scode = first_significant_code(scode, FALSE);
9040
29
     op = *scode;
9041
29
     }
9042
9043
   /* Non-capturing brackets */
9044
9045
53.3k
   if (op == OP_BRA  || op == OP_BRAPOS ||
9046
52.1k
       op == OP_SBRA || op == OP_SBRAPOS)
9047
1.98k
     {
9048
1.98k
     if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
9049
1.10k
       return FALSE;
9050
1.98k
     }
9051
9052
   /* Capturing brackets */
9053
9054
51.3k
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
9055
47.3k
            op == OP_SCBRA || op == OP_SCBRAPOS)
9056
6.20k
     {
9057
6.20k
     int n = GET2(scode, 1+LINK_SIZE);
9058
6.20k
     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
9059
6.20k
     if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))
9060
3.84k
       return FALSE;
9061
6.20k
     }
9062
9063
   /* Positive forward assertions */
9064
9065
45.1k
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
9066
759
     {
9067
759
     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
9068
677
       return FALSE;
9069
759
     }
9070
9071
   /* Atomic brackets */
9072
9073
44.4k
   else if (op == OP_ONCE)
9074
420
     {
9075
420
     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
9076
338
       return FALSE;
9077
420
     }
9078
9079
   /* .* means "start at start or after \n" if it isn't in atomic brackets or
9080
   brackets that may be referenced or an assertion, and as long as the pattern
9081
   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
9082
   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
9083
   i.e. not at the start of a line. There is also an option that disables this
9084
   optimization. */
9085
9086
44.0k
   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
9087
8.22k
     {
9088
8.22k
     if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
9089
5.36k
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
9090
2.93k
       return FALSE;
9091
8.22k
     }
9092
9093
   /* Check for explicit circumflex; anything else gives a FALSE result. Note
9094
   in particular that this includes atomic brackets OP_ONCE because the number
9095
   of characters matched by .* cannot be adjusted inside them. */
9096
9097
35.7k
   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
9098
9099
   /* Move on to the next alternative */
9100
9101
10.0k
   code += GET(code, 1);
9102
10.0k
   }
9103
48.3k
while (*code == OP_ALT);  /* Loop for each alternative */
9104
4.54k
return TRUE;
9105
48.3k
}
9106
9107
9108
9109
/*************************************************
9110
*   Scan compiled regex for recursion reference  *
9111
*************************************************/
9112
9113
/* This function scans through a compiled pattern until it finds an instance of
9114
OP_RECURSE.
9115
9116
Arguments:
9117
  code        points to start of expression
9118
  utf         TRUE in UTF mode
9119
9120
Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
9121
*/
9122
9123
static PCRE2_UCHAR *
9124
find_recurse(PCRE2_UCHAR *code, BOOL utf)
9125
1.30M
{
9126
1.30M
for (;;)
9127
40.1M
  {
9128
40.1M
  PCRE2_UCHAR c = *code;
9129
40.1M
  if (c == OP_END) return NULL;
9130
40.1M
  if (c == OP_RECURSE) return code;
9131
9132
  /* XCLASS is used for classes that cannot be represented just by a bit map.
9133
  This includes negated single high-valued characters. ECLASS is used for
9134
  classes that use set operations internally. CALLOUT_STR is used for
9135
  callouts with string arguments. In each case the length in the table is
9136
  zero; the actual length is stored in the compiled code. */
9137
9138
38.8M
  if (c == OP_XCLASS || c == OP_ECLASS) code += GET(code, 1);
9139
38.7M
  else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
9140
9141
  /* Otherwise, we can get the item's length from the table, except that for
9142
  repeated character types, we have to test for \p and \P, which have an extra
9143
  two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
9144
  we must add in its length. */
9145
9146
38.7M
  else
9147
38.7M
    {
9148
38.7M
    switch(c)
9149
38.7M
      {
9150
156k
      case OP_TYPESTAR:
9151
209k
      case OP_TYPEMINSTAR:
9152
313k
      case OP_TYPEPLUS:
9153
368k
      case OP_TYPEMINPLUS:
9154
427k
      case OP_TYPEQUERY:
9155
453k
      case OP_TYPEMINQUERY:
9156
465k
      case OP_TYPEPOSSTAR:
9157
467k
      case OP_TYPEPOSPLUS:
9158
470k
      case OP_TYPEPOSQUERY:
9159
470k
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
9160
470k
      break;
9161
9162
28.9k
      case OP_TYPEPOSUPTO:
9163
61.5k
      case OP_TYPEUPTO:
9164
67.1k
      case OP_TYPEMINUPTO:
9165
80.9k
      case OP_TYPEEXACT:
9166
80.9k
      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
9167
13.4k
        code += 2;
9168
80.9k
      break;
9169
9170
29.2k
      case OP_MARK:
9171
31.4k
      case OP_COMMIT_ARG:
9172
38.1k
      case OP_PRUNE_ARG:
9173
65.9k
      case OP_SKIP_ARG:
9174
93.2k
      case OP_THEN_ARG:
9175
93.2k
      code += code[1];
9176
93.2k
      break;
9177
38.7M
      }
9178
9179
    /* Add in the fixed length from the table */
9180
9181
38.7M
    code += PRIV(OP_lengths)[c];
9182
9183
    /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
9184
    be followed by a multi-unit character. The length in the table is a
9185
    minimum, so we have to arrange to skip the extra units. */
9186
9187
38.7M
#ifdef MAYBE_UTF_MULTI
9188
38.7M
    if (utf) switch(c)
9189
559k
      {
9190
40.9k
      case OP_CHAR:
9191
233k
      case OP_CHARI:
9192
233k
      case OP_NOT:
9193
233k
      case OP_NOTI:
9194
234k
      case OP_EXACT:
9195
234k
      case OP_EXACTI:
9196
234k
      case OP_NOTEXACT:
9197
235k
      case OP_NOTEXACTI:
9198
236k
      case OP_UPTO:
9199
237k
      case OP_UPTOI:
9200
237k
      case OP_NOTUPTO:
9201
237k
      case OP_NOTUPTOI:
9202
237k
      case OP_MINUPTO:
9203
238k
      case OP_MINUPTOI:
9204
238k
      case OP_NOTMINUPTO:
9205
238k
      case OP_NOTMINUPTOI:
9206
238k
      case OP_POSUPTO:
9207
241k
      case OP_POSUPTOI:
9208
241k
      case OP_NOTPOSUPTO:
9209
241k
      case OP_NOTPOSUPTOI:
9210
242k
      case OP_STAR:
9211
246k
      case OP_STARI:
9212
246k
      case OP_NOTSTAR:
9213
247k
      case OP_NOTSTARI:
9214
247k
      case OP_MINSTAR:
9215
248k
      case OP_MINSTARI:
9216
248k
      case OP_NOTMINSTAR:
9217
248k
      case OP_NOTMINSTARI:
9218
248k
      case OP_POSSTAR:
9219
248k
      case OP_POSSTARI:
9220
249k
      case OP_NOTPOSSTAR:
9221
250k
      case OP_NOTPOSSTARI:
9222
251k
      case OP_PLUS:
9223
253k
      case OP_PLUSI:
9224
253k
      case OP_NOTPLUS:
9225
253k
      case OP_NOTPLUSI:
9226
254k
      case OP_MINPLUS:
9227
256k
      case OP_MINPLUSI:
9228
256k
      case OP_NOTMINPLUS:
9229
256k
      case OP_NOTMINPLUSI:
9230
256k
      case OP_POSPLUS:
9231
256k
      case OP_POSPLUSI:
9232
256k
      case OP_NOTPOSPLUS:
9233
256k
      case OP_NOTPOSPLUSI:
9234
258k
      case OP_QUERY:
9235
259k
      case OP_QUERYI:
9236
260k
      case OP_NOTQUERY:
9237
260k
      case OP_NOTQUERYI:
9238
261k
      case OP_MINQUERY:
9239
262k
      case OP_MINQUERYI:
9240
262k
      case OP_NOTMINQUERY:
9241
262k
      case OP_NOTMINQUERYI:
9242
262k
      case OP_POSQUERY:
9243
262k
      case OP_POSQUERYI:
9244
262k
      case OP_NOTPOSQUERY:
9245
262k
      case OP_NOTPOSQUERYI:
9246
262k
      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9247
262k
      break;
9248
559k
      }
9249
#else
9250
    (void)(utf);  /* Keep compiler happy by referencing function argument */
9251
#endif  /* MAYBE_UTF_MULTI */
9252
38.7M
    }
9253
38.8M
  }
9254
1.30M
}
9255
9256
9257
9258
/*************************************************
9259
*    Check for asserted fixed first code unit    *
9260
*************************************************/
9261
9262
/* During compilation, the "first code unit" settings from forward assertions
9263
are discarded, because they can cause conflicts with actual literals that
9264
follow. However, if we end up without a first code unit setting for an
9265
unanchored pattern, it is worth scanning the regex to see if there is an
9266
initial asserted first code unit. If all branches start with the same asserted
9267
code unit, or with a non-conditional bracket all of whose alternatives start
9268
with the same asserted code unit (recurse ad lib), then we return that code
9269
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9270
REQ_NONE in the flags.
9271
9272
Arguments:
9273
  code       points to start of compiled pattern
9274
  flags      points to the first code unit flags
9275
  inassert   non-zero if in an assertion
9276
9277
Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9278
*/
9279
9280
static uint32_t
9281
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9282
48.4k
{
9283
48.4k
uint32_t c = 0;
9284
48.4k
uint32_t cflags = REQ_NONE;
9285
9286
48.4k
*flags = REQ_NONE;
9287
63.3k
do {
9288
63.3k
   uint32_t d;
9289
63.3k
   uint32_t dflags;
9290
63.3k
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9291
58.4k
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9292
63.3k
   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9293
63.3k
   PCRE2_UCHAR op = *scode;
9294
9295
63.3k
   switch(op)
9296
63.3k
     {
9297
32.5k
     default:
9298
32.5k
     return 0;
9299
9300
1.11k
     case OP_BRA:
9301
1.19k
     case OP_BRAPOS:
9302
6.05k
     case OP_CBRA:
9303
6.14k
     case OP_SCBRA:
9304
6.34k
     case OP_CBRAPOS:
9305
6.53k
     case OP_SCBRAPOS:
9306
7.23k
     case OP_ASSERT:
9307
7.75k
     case OP_ASSERT_NA:
9308
8.07k
     case OP_ONCE:
9309
8.16k
     case OP_SCRIPT_RUN:
9310
8.16k
     d = find_firstassertedcu(scode, &dflags, inassert +
9311
8.16k
       ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9312
8.16k
     if (dflags >= REQ_NONE) return 0;
9313
1.85k
     if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9314
1.25k
       else if (c != d || cflags != dflags) return 0;
9315
1.83k
     break;
9316
9317
1.83k
     case OP_EXACT:
9318
448
     scode += IMM2_SIZE;
9319
448
     PCRE2_FALLTHROUGH /* Fall through */
9320
9321
10.0k
     case OP_CHAR:
9322
10.7k
     case OP_PLUS:
9323
11.1k
     case OP_MINPLUS:
9324
14.8k
     case OP_POSPLUS:
9325
14.8k
     if (inassert == 0) return 0;
9326
9.20k
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9327
8.14k
       else if (c != scode[1]) return 0;
9328
9.17k
     break;
9329
9330
9.17k
     case OP_EXACTI:
9331
78
     scode += IMM2_SIZE;
9332
78
     PCRE2_FALLTHROUGH /* Fall through */
9333
9334
4.67k
     case OP_CHARI:
9335
6.76k
     case OP_PLUSI:
9336
7.34k
     case OP_MINPLUSI:
9337
7.78k
     case OP_POSPLUSI:
9338
7.78k
     if (inassert == 0) return 0;
9339
9340
     /* If the character is more than one code unit long, we cannot set its
9341
     first code unit when matching caselessly. Later scanning may pick up
9342
     multiple code units. */
9343
9344
6.07k
#ifdef SUPPORT_UNICODE
9345
6.07k
#if PCRE2_CODE_UNIT_WIDTH == 8
9346
6.07k
     if (scode[1] >= 0x80) return 0;
9347
#elif PCRE2_CODE_UNIT_WIDTH == 16
9348
     if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9349
#endif
9350
6.04k
#endif
9351
9352
6.04k
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9353
5.39k
       else if (c != scode[1]) return 0;
9354
6.03k
     break;
9355
63.3k
     }
9356
9357
17.0k
   code += GET(code, 1);
9358
17.0k
   }
9359
48.4k
while (*code == OP_ALT);
9360
9361
2.14k
*flags = cflags;
9362
2.14k
return c;
9363
48.4k
}
9364
9365
9366
9367
/*************************************************
9368
*             Skip in parsed pattern             *
9369
*************************************************/
9370
9371
/* This function is called to skip parts of the parsed pattern when finding the
9372
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9373
the end of the branch, it is called to skip over an internal lookaround or
9374
(DEFINE) group, and it is also called to skip to the end of a class, during
9375
which it will never encounter nested groups (but there's no need to have
9376
special code for that).
9377
9378
When called to find the end of a branch or group, pptr must point to the first
9379
meta code inside the branch, not the branch-starting code. In other cases it
9380
can point to the item that causes the function to be called.
9381
9382
Arguments:
9383
  pptr       current pointer to skip from
9384
  skiptype   PSKIP_CLASS when skipping to end of class
9385
             PSKIP_ALT when META_ALT ends the skip
9386
             PSKIP_KET when only META_KET ends the skip
9387
9388
Returns:     new value of pptr
9389
             NULL if META_END is reached - should never occur
9390
               or for an unknown meta value - likewise
9391
*/
9392
9393
static uint32_t *
9394
parsed_skip(uint32_t *pptr, uint32_t skiptype)
9395
59.9k
{
9396
59.9k
uint32_t nestlevel = 0;
9397
9398
9.34M
for (;; pptr++)
9399
9.40M
  {
9400
9.40M
  uint32_t meta = META_CODE(*pptr);
9401
9402
9.40M
  switch(meta)
9403
9.40M
    {
9404
7.86M
    default:  /* Just skip over most items */
9405
7.86M
    if (meta < META_END) continue;  /* Literal */
9406
1.15M
    break;
9407
9408
    /* The parsed regex is malformed; we have reached the end and did
9409
    not find the end of the construct which we are skipping over. */
9410
9411
    /* LCOV_EXCL_START */
9412
1.15M
    case META_END:
9413
0
    PCRE2_DEBUG_UNREACHABLE();
9414
0
    return NULL;
9415
    /* LCOV_EXCL_STOP */
9416
9417
    /* The data for these items is variable in length. */
9418
9419
4.45k
    case META_BACKREF:  /* Offset is present only if group >= 10 */
9420
4.45k
    if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9421
4.45k
    break;
9422
9423
281k
    case META_ESCAPE:
9424
281k
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9425
15.6k
      pptr += 1;     /* Skip prop data */
9426
281k
    break;
9427
9428
2.42k
    case META_MARK:     /* Add the length of the name. */
9429
4.15k
    case META_COMMIT_ARG:
9430
10.5k
    case META_PRUNE_ARG:
9431
11.4k
    case META_SKIP_ARG:
9432
24.4k
    case META_THEN_ARG:
9433
24.4k
    pptr += pptr[1];
9434
24.4k
    break;
9435
9436
    /* These are the "active" items in this loop. */
9437
9438
44.6k
    case META_CLASS_END:
9439
44.6k
    if (skiptype == PSKIP_CLASS) return pptr;
9440
41.8k
    break;
9441
9442
41.8k
    case META_ATOMIC:
9443
361k
    case META_CAPTURE:
9444
363k
    case META_COND_ASSERT:
9445
363k
    case META_COND_DEFINE:
9446
363k
    case META_COND_NAME:
9447
365k
    case META_COND_NUMBER:
9448
366k
    case META_COND_RNAME:
9449
378k
    case META_COND_RNUMBER:
9450
378k
    case META_COND_VERSION:
9451
381k
    case META_SCS:
9452
421k
    case META_LOOKAHEAD:
9453
431k
    case META_LOOKAHEADNOT:
9454
434k
    case META_LOOKAHEAD_NA:
9455
437k
    case META_LOOKBEHIND:
9456
449k
    case META_LOOKBEHINDNOT:
9457
453k
    case META_LOOKBEHIND_NA:
9458
506k
    case META_NOCAPTURE:
9459
506k
    case META_SCRIPT_RUN:
9460
506k
    nestlevel++;
9461
506k
    break;
9462
9463
112k
    case META_ALT:
9464
112k
    if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9465
112k
    break;
9466
9467
563k
    case META_KET:
9468
563k
    if (nestlevel == 0) return pptr;
9469
506k
    nestlevel--;
9470
506k
    break;
9471
9.40M
    }
9472
9473
  /* The extra data item length for each meta is in a table. */
9474
9475
2.63M
  meta = (meta >> 16) & 0x7fff;
9476
2.63M
  if (meta >= sizeof(meta_extra_lengths)) return NULL;
9477
2.63M
  pptr += meta_extra_lengths[meta];
9478
2.63M
  }
9479
9480
/* LCOV_EXCL_START */
9481
59.9k
PCRE2_UNREACHABLE(); /* Control never reaches here */
9482
/* LCOV_EXCL_STOP */
9483
59.9k
}
9484
9485
9486
9487
/*************************************************
9488
*       Find length of a parsed group            *
9489
*************************************************/
9490
9491
/* This is called for nested groups within a branch of a lookbehind whose
9492
length is being computed. On entry, the pointer must be at the first element
9493
after the group initializing code. On exit it points to OP_KET. Caching is used
9494
to improve processing speed when the same capturing group occurs many times.
9495
9496
Arguments:
9497
  pptrptr     pointer to pointer in the parsed pattern
9498
  minptr      where to return the minimum length
9499
  isinline    FALSE if a reference or recursion; TRUE for inline group
9500
  errcodeptr  pointer to the errorcode
9501
  lcptr       pointer to the loop counter
9502
  group       number of captured group or -1 for a non-capturing group
9503
  recurses    chain of recurse_check to catch mutual recursion
9504
  cb          pointer to the compile data
9505
9506
Returns:      the maximum group length or a negative number
9507
*/
9508
9509
static int
9510
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9511
  int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9512
180k
{
9513
180k
uint32_t *gi = cb->groupinfo + 2 * group;
9514
180k
int branchlength, branchminlength;
9515
180k
int grouplength = -1;
9516
180k
int groupminlength = INT_MAX;
9517
9518
/* The cache can be used only if there is no possibility of there being two
9519
groups with the same number. We do not need to set the end pointer for a group
9520
that is being processed as a back reference or recursion, but we must do so for
9521
an inline group. */
9522
9523
180k
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9524
5.49k
  {
9525
5.49k
  uint32_t groupinfo = gi[0];
9526
5.49k
  if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9527
5.49k
  if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9528
1.15k
    {
9529
1.15k
    if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9530
1.15k
    *minptr = gi[1];
9531
1.15k
    return groupinfo & GI_FIXED_LENGTH_MASK;
9532
1.15k
    }
9533
5.49k
  }
9534
9535
/* Scan the group. In this case we find the end pointer of necessity. */
9536
9537
178k
for(;;)
9538
189k
  {
9539
189k
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9540
189k
    recurses, cb);
9541
189k
  if (branchlength < 0) goto ISNOTFIXED;
9542
186k
  if (branchlength > grouplength) grouplength = branchlength;
9543
186k
  if (branchminlength < groupminlength) groupminlength = branchminlength;
9544
186k
  if (**pptrptr == META_KET) break;
9545
10.6k
  *pptrptr += 1;   /* Skip META_ALT */
9546
10.6k
  }
9547
9548
176k
if (group > 0)
9549
152k
  {
9550
152k
  gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9551
152k
  gi[1] = groupminlength;
9552
152k
  }
9553
9554
176k
*minptr = groupminlength;
9555
176k
return grouplength;
9556
9557
2.93k
ISNOTFIXED:
9558
2.93k
if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9559
2.93k
return -1;
9560
178k
}
9561
9562
9563
9564
/*************************************************
9565
*        Find length of a parsed branch          *
9566
*************************************************/
9567
9568
/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9569
giving an error if the length is not limited. On entry, *pptrptr points to the
9570
first element inside the branch. On exit it is set to point to the ALT or KET.
9571
9572
Arguments:
9573
  pptrptr     pointer to pointer in the parsed pattern
9574
  minptr      where to return the minimum length
9575
  errcodeptr  pointer to error code
9576
  lcptr       pointer to loop counter
9577
  recurses    chain of recurse_check to catch mutual recursion
9578
  cb          pointer to compile block
9579
9580
Returns:      the maximum length, or a negative value on error
9581
*/
9582
9583
static int
9584
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9585
  parsed_recurse_check *recurses, compile_block *cb)
9586
215k
{
9587
215k
int branchlength = 0;
9588
215k
int branchminlength = 0;
9589
215k
int grouplength, groupminlength;
9590
215k
uint32_t lastitemlength = 0;
9591
215k
uint32_t lastitemminlength = 0;
9592
215k
uint32_t *pptr = *pptrptr;
9593
215k
PCRE2_SIZE offset;
9594
215k
parsed_recurse_check this_recurse;
9595
9596
/* A large and/or complex regex can take too long to process. This can happen
9597
more often when (?| groups are present in the pattern because their length
9598
cannot be cached. */
9599
9600
215k
if ((*lcptr)++ > 2000)
9601
30
  {
9602
30
  *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9603
30
  return -1;
9604
30
  }
9605
9606
/* Scan the branch, accumulating the length. */
9607
9608
943k
for (;; pptr++)
9609
1.15M
  {
9610
1.15M
  parsed_recurse_check *r;
9611
1.15M
  uint32_t *gptr, *gptrend;
9612
1.15M
  uint32_t escape;
9613
1.15M
  uint32_t min, max;
9614
1.15M
  uint32_t group = 0;
9615
1.15M
  uint32_t itemlength = 0;
9616
1.15M
  uint32_t itemminlength = 0;
9617
9618
1.15M
  if (*pptr < META_END)
9619
560k
    {
9620
560k
    itemlength = itemminlength = 1;
9621
560k
    }
9622
9623
598k
  else switch (META_CODE(*pptr))
9624
598k
    {
9625
194k
    case META_KET:
9626
209k
    case META_ALT:
9627
209k
    goto EXIT;
9628
9629
    /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9630
    actual termination. */
9631
9632
2.07k
    case META_ACCEPT:
9633
2.88k
    case META_FAIL:
9634
2.88k
    pptr = parsed_skip(pptr, PSKIP_ALT);
9635
2.88k
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9636
2.88k
    goto EXIT;
9637
9638
2.88k
    case META_MARK:
9639
3.46k
    case META_COMMIT_ARG:
9640
7.43k
    case META_PRUNE_ARG:
9641
7.84k
    case META_SKIP_ARG:
9642
12.4k
    case META_THEN_ARG:
9643
12.4k
    pptr += pptr[1] + 1;
9644
12.4k
    break;
9645
9646
2.97k
    case META_CIRCUMFLEX:
9647
3.37k
    case META_COMMIT:
9648
5.42k
    case META_DOLLAR:
9649
5.96k
    case META_PRUNE:
9650
9.21k
    case META_SKIP:
9651
10.6k
    case META_THEN:
9652
10.6k
    break;
9653
9654
1.14k
    case META_OPTIONS:
9655
1.14k
    pptr += 2;
9656
1.14k
    break;
9657
9658
0
    case META_BIGVALUE:
9659
0
    itemlength = itemminlength = 1;
9660
0
    pptr += 1;
9661
0
    break;
9662
9663
1.14k
    case META_CLASS:
9664
2.80k
    case META_CLASS_NOT:
9665
2.80k
    itemlength = itemminlength = 1;
9666
2.80k
    pptr = parsed_skip(pptr, PSKIP_CLASS);
9667
2.80k
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9668
2.80k
    break;
9669
9670
2.80k
    case META_CLASS_EMPTY_NOT:
9671
4.03k
    case META_DOT:
9672
4.03k
    itemlength = itemminlength = 1;
9673
4.03k
    break;
9674
9675
93.2k
    case META_CALLOUT_NUMBER:
9676
93.2k
    pptr += 3;
9677
93.2k
    break;
9678
9679
879
    case META_CALLOUT_STRING:
9680
879
    pptr += 3 + SIZEOFFSET;
9681
879
    break;
9682
9683
    /* Only some escapes consume a character. Of those, \R can match one or two
9684
    characters, but \X is never allowed because it matches an unknown number of
9685
    characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9686
9687
16.3k
    case META_ESCAPE:
9688
16.3k
    escape = META_DATA(*pptr);
9689
16.3k
    if (escape == ESC_X) return -1;
9690
16.3k
    if (escape == ESC_R)
9691
278
      {
9692
278
      itemminlength = 1;
9693
278
      itemlength = 2;
9694
278
      }
9695
16.0k
    else if (escape > ESC_b && escape < ESC_Z)
9696
10.0k
      {
9697
10.0k
#if PCRE2_CODE_UNIT_WIDTH != 32
9698
10.0k
      if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9699
0
        {
9700
0
        *errcodeptr = ERR36;
9701
0
        return -1;
9702
0
        }
9703
10.0k
#endif
9704
10.0k
      itemlength = itemminlength = 1;
9705
10.0k
      if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9706
10.0k
      }
9707
16.3k
    break;
9708
9709
    /* Lookaheads do not contribute to the length of this branch, but they may
9710
    contain lookbehinds within them whose lengths need to be set. */
9711
9712
21.7k
    case META_LOOKAHEAD:
9713
30.4k
    case META_LOOKAHEADNOT:
9714
31.2k
    case META_LOOKAHEAD_NA:
9715
31.6k
    case META_SCS:
9716
31.6k
    *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9717
31.6k
    if (*errcodeptr != 0) return -1;
9718
9719
    /* Ignore any qualifiers that follow a lookahead assertion. */
9720
9721
31.6k
    switch (pptr[1])
9722
31.6k
      {
9723
609
      case META_ASTERISK:
9724
813
      case META_ASTERISK_PLUS:
9725
3.63k
      case META_ASTERISK_QUERY:
9726
4.46k
      case META_PLUS:
9727
4.66k
      case META_PLUS_PLUS:
9728
8.40k
      case META_PLUS_QUERY:
9729
8.79k
      case META_QUERY:
9730
9.09k
      case META_QUERY_PLUS:
9731
9.55k
      case META_QUERY_QUERY:
9732
9.55k
      pptr++;
9733
9.55k
      break;
9734
9735
355
      case META_MINMAX:
9736
628
      case META_MINMAX_PLUS:
9737
877
      case META_MINMAX_QUERY:
9738
877
      pptr += 3;
9739
877
      break;
9740
9741
21.2k
      default:
9742
21.2k
      break;
9743
31.6k
      }
9744
31.6k
    break;
9745
9746
    /* A nested lookbehind does not contribute any length to this lookbehind,
9747
    but must itself be checked and have its lengths set. Note that
9748
    set_lookbehind_lengths() updates pptr, leaving it pointing to the final ket
9749
    of the group, so no need to update it here. */
9750
9751
31.6k
    case META_LOOKBEHIND:
9752
3.94k
    case META_LOOKBEHINDNOT:
9753
5.32k
    case META_LOOKBEHIND_NA:
9754
5.32k
    if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9755
161
      return -1;
9756
5.16k
    break;
9757
9758
    /* Back references and recursions are handled by very similar code. At this
9759
    stage, the names generated in the parsing pass are available, but the main
9760
    name table has not yet been created. So for the named varieties, scan the
9761
    list of names in order to get the number of the first one in the pattern,
9762
    and whether or not this name is duplicated. */
9763
9764
5.16k
    case META_BACKREF_BYNAME:
9765
79
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9766
4
      goto ISNOTFIXED;
9767
75
    PCRE2_FALLTHROUGH /* Fall through */
9768
75
9769
331
    case META_RECURSE_BYNAME:
9770
331
      {
9771
331
      PCRE2_SPTR name;
9772
331
      BOOL is_dupname = FALSE;
9773
331
      named_group *ng;
9774
331
      uint32_t meta_code = META_CODE(*pptr);
9775
331
      uint32_t length = *(++pptr);
9776
9777
331
      GETPLUSOFFSET(offset, pptr);
9778
331
      name = cb->start_pattern + offset;
9779
331
      ng = PRIV(compile_find_named_group)(name, length, cb);
9780
9781
331
      if (ng == NULL)
9782
3
        {
9783
3
        *errcodeptr = ERR15;  /* Non-existent subpattern */
9784
3
        cb->erroroffset = offset;
9785
3
        return -1;
9786
3
        }
9787
9788
328
      group = ng->number;
9789
328
      is_dupname = (ng->hash_dup & NAMED_GROUP_IS_DUPNAME) != 0;
9790
9791
      /* A numerical back reference can be fixed length if duplicate capturing
9792
      groups are not being used. A non-duplicate named back reference can also
9793
      be handled. */
9794
9795
328
      if (meta_code == META_RECURSE_BYNAME ||
9796
72
          (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9797
322
        goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9798
328
      }
9799
6
    goto ISNOTFIXED;                     /* Duplicate name or number */
9800
9801
    /* The offset values for back references < 10 are in a separate vector
9802
    because otherwise they would use more than two parsed pattern elements on
9803
    64-bit systems. */
9804
9805
1.26k
    case META_BACKREF:
9806
1.26k
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9807
1.24k
        (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9808
24
      goto ISNOTFIXED;
9809
1.24k
    group = META_DATA(*pptr);
9810
1.24k
    if (group < 10)
9811
576
      {
9812
576
      offset = cb->small_ref_offset[group];
9813
576
      goto RECURSE_OR_BACKREF_LENGTH;
9814
576
      }
9815
9816
669
    PCRE2_FALLTHROUGH /* Fall through */
9817
669
    /* For groups >= 10 - picking up group twice does no harm. */
9818
669
9819
669
    /* A true recursion implies not fixed length, but a subroutine call may
9820
669
    be OK. Back reference "recursions" are also failed. */
9821
669
9822
53.0k
    case META_RECURSE:
9823
53.0k
    group = META_DATA(*pptr);
9824
53.0k
    GETPLUSOFFSET(offset, pptr);
9825
9826
53.9k
    RECURSE_OR_BACKREF_LENGTH:
9827
53.9k
    if (group > cb->bracount)
9828
122
      {
9829
122
      cb->erroroffset = offset;
9830
122
      *errcodeptr = ERR15;  /* Non-existent subpattern */
9831
122
      return -1;
9832
122
      }
9833
53.7k
    if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9834
99.3M
    for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9835
99.3M
      {
9836
99.3M
      if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9837
99.3M
        else if (*gptr == (META_CAPTURE | group)) break;
9838
99.3M
      }
9839
9840
    /* We must start the search for the end of the group at the first meta code
9841
    inside the group. Otherwise it will be treated as an enclosed group. */
9842
9843
53.7k
    gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9844
53.7k
    if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9845
53.7k
    if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9846
275k
    for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9847
53.7k
    if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9848
53.7k
    this_recurse.prev = recurses;
9849
53.7k
    this_recurse.groupptr = gptr;
9850
9851
    /* We do not need to know the position of the end of the group, that is,
9852
    gptr is not used after the call to get_grouplength(). Setting the second
9853
    argument FALSE stops it scanning for the end when the length can be found
9854
    in the cache. */
9855
9856
53.7k
    gptr++;
9857
53.7k
    grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9858
53.7k
      lcptr, group, &this_recurse, cb);
9859
53.7k
    if (grouplength < 0)
9860
970
      {
9861
970
      if (*errcodeptr == 0) goto ISNOTFIXED;
9862
967
      return -1;  /* Error already set */
9863
970
      }
9864
52.7k
    itemlength = grouplength;
9865
52.7k
    itemminlength = groupminlength;
9866
52.7k
    break;
9867
9868
    /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9869
    the length of this branch. Skip from the following item to the next
9870
    unpaired ket. */
9871
9872
70
    case META_COND_DEFINE:
9873
70
    pptr = parsed_skip(pptr + 1, PSKIP_KET);
9874
70
    break;
9875
9876
    /* Check other nested groups - advance past the initial data for each type
9877
    and then seek a fixed length with get_grouplength(). */
9878
9879
258
    case META_COND_NAME:
9880
1.47k
    case META_COND_NUMBER:
9881
1.69k
    case META_COND_RNAME:
9882
3.07k
    case META_COND_RNUMBER:
9883
3.07k
    pptr += 2 + SIZEOFFSET;
9884
3.07k
    goto CHECK_GROUP;
9885
9886
555
    case META_COND_ASSERT:
9887
555
    pptr += 1;
9888
555
    goto CHECK_GROUP;
9889
9890
566
    case META_COND_VERSION:
9891
566
    pptr += 4;
9892
566
    goto CHECK_GROUP;
9893
9894
102k
    case META_CAPTURE:
9895
102k
    group = META_DATA(*pptr);
9896
102k
    PCRE2_FALLTHROUGH /* Fall through */
9897
9898
103k
    case META_ATOMIC:
9899
121k
    case META_NOCAPTURE:
9900
122k
    case META_SCRIPT_RUN:
9901
122k
    pptr++;
9902
126k
    CHECK_GROUP:
9903
126k
    grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9904
126k
      lcptr, group, recurses, cb);
9905
126k
    if (grouplength < 0) return -1;
9906
124k
    itemlength = grouplength;
9907
124k
    itemminlength = groupminlength;
9908
124k
    break;
9909
9910
14.9k
    case META_QUERY:
9911
22.6k
    case META_QUERY_PLUS:
9912
23.0k
    case META_QUERY_QUERY:
9913
23.0k
    min = 0;
9914
23.0k
    max = 1;
9915
23.0k
    goto REPETITION;
9916
9917
    /* Exact repetition is OK; variable repetition is not. A repetition of zero
9918
    must subtract the length that has already been added. */
9919
9920
2.30k
    case META_MINMAX:
9921
2.82k
    case META_MINMAX_PLUS:
9922
4.41k
    case META_MINMAX_QUERY:
9923
4.41k
    min = pptr[1];
9924
4.41k
    max = pptr[2];
9925
4.41k
    pptr += 2;
9926
9927
27.4k
    REPETITION:
9928
27.4k
    if (max != REPEAT_UNLIMITED)
9929
27.4k
      {
9930
27.4k
      if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9931
25.6k
          max != 0 &&
9932
23.4k
          (INT_MAX - branchlength)/lastitemlength < max - 1)
9933
3
        {
9934
3
        *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9935
3
        return -1;
9936
3
        }
9937
27.4k
      if (min == 0) branchminlength -= lastitemminlength;
9938
2.04k
        else itemminlength = (min - 1) * lastitemminlength;
9939
27.4k
      if (max == 0) branchlength -= lastitemlength;
9940
25.2k
        else itemlength = (max - 1) * lastitemlength;
9941
27.4k
      break;
9942
27.4k
      }
9943
3
    PCRE2_FALLTHROUGH /* Fall through */
9944
3
9945
3
    /* Any other item means this branch does not have a fixed length. */
9946
3
9947
137
    default:
9948
212
    ISNOTFIXED:
9949
212
    *errcodeptr = ERR25;   /* Not fixed length */
9950
212
    return -1;
9951
598k
    }
9952
9953
  /* Add the item length to the branchlength, checking for integer overflow and
9954
  for the branch length exceeding the overall limit. Later, if there is at
9955
  least one variable-length branch in the group, there is a test for the
9956
  (smaller) variable-length branch length limit. */
9957
9958
943k
  if (INT_MAX - branchlength < (int)itemlength ||
9959
943k
      (branchlength += itemlength) > LOOKBEHIND_MAX)
9960
34
    {
9961
34
    *errcodeptr = ERR87;
9962
34
    return -1;
9963
34
    }
9964
9965
943k
  branchminlength += itemminlength;
9966
9967
  /* Save this item length for use if the next item is a quantifier. */
9968
9969
943k
  lastitemlength = itemlength;
9970
943k
  lastitemminlength = itemminlength;
9971
943k
  }
9972
9973
212k
EXIT:
9974
212k
*pptrptr = pptr;
9975
212k
*minptr = branchminlength;
9976
212k
return branchlength;
9977
9978
/* LCOV_EXCL_START */
9979
0
PARSED_SKIP_FAILED:
9980
0
PCRE2_DEBUG_UNREACHABLE();
9981
0
*errcodeptr = ERR90;  /* Unhandled META code - internal error */
9982
0
return -1;
9983
/* LCOV_EXCL_STOP */
9984
215k
}
9985
9986
9987
9988
/*************************************************
9989
*        Set lengths in a lookbehind             *
9990
*************************************************/
9991
9992
/* This function is called for each lookbehind, to set the lengths in its
9993
branches. An error occurs if any branch does not have a limited maximum length
9994
that is less than the limit (65535). On exit, the pointer must be left on the
9995
final ket.
9996
9997
The function also maintains the max_lookbehind value. Any lookbehind branch
9998
that contains a nested lookbehind may actually look further back than the
9999
length of the branch. The additional amount is passed back from
10000
get_branchlength() as an "extra" value.
10001
10002
Arguments:
10003
  pptrptr     pointer to pointer in the parsed pattern
10004
  errcodeptr  pointer to error code
10005
  lcptr       pointer to loop counter
10006
  recurses    chain of recurse_check to catch mutual recursion
10007
  cb          pointer to compile block
10008
10009
Returns:      TRUE if all is well
10010
              FALSE otherwise, with error code and offset set
10011
*/
10012
10013
static BOOL
10014
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
10015
  parsed_recurse_check *recurses, compile_block *cb)
10016
21.1k
{
10017
21.1k
PCRE2_SIZE offset;
10018
21.1k
uint32_t *bptr = *pptrptr;
10019
21.1k
uint32_t *gbptr = bptr;
10020
21.1k
int maxlength = 0;
10021
21.1k
int minlength = INT_MAX;
10022
21.1k
BOOL variable = FALSE;
10023
10024
21.1k
READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
10025
21.1k
*pptrptr += SIZEOFFSET;
10026
10027
/* Each branch can have a different maximum length, but we can keep only a
10028
single minimum for the whole group, because there's nowhere to save individual
10029
values in the META_ALT item. */
10030
10031
21.1k
do
10032
26.2k
  {
10033
26.2k
  int branchlength, branchminlength;
10034
10035
26.2k
  *pptrptr += 1;
10036
26.2k
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
10037
26.2k
    recurses, cb);
10038
10039
26.2k
  if (branchlength < 0)
10040
635
    {
10041
    /* The errorcode and offset may already be set from a nested lookbehind. */
10042
635
    if (*errcodeptr == 0) *errcodeptr = ERR25;
10043
635
    if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
10044
635
    return FALSE;
10045
635
    }
10046
10047
25.5k
  if (branchlength != branchminlength) variable = TRUE;
10048
25.5k
  if (branchminlength < minlength) minlength = branchminlength;
10049
25.5k
  if (branchlength > maxlength) maxlength = branchlength;
10050
25.5k
  if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
10051
25.5k
  *bptr |= branchlength;  /* branchlength never more than 65535 */
10052
25.5k
  bptr = *pptrptr;
10053
25.5k
  }
10054
25.5k
while (META_CODE(*bptr) == META_ALT);
10055
10056
/* If any branch is of variable length, the whole lookbehind is of variable
10057
length. If the maximum length of any branch exceeds the maximum for variable
10058
lookbehinds, give an error. Otherwise, the minimum length is set in the word
10059
that follows the original group META value. For a fixed-length lookbehind, this
10060
is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
10061
possibly different) length. */
10062
10063
20.5k
if (variable)
10064
7.95k
  {
10065
7.95k
  gbptr[1] = minlength;
10066
7.95k
  if ((PCRE2_SIZE)maxlength > cb->max_varlookbehind)
10067
88
    {
10068
88
    *errcodeptr = ERR100;
10069
88
    cb->erroroffset = offset;
10070
88
    return FALSE;
10071
88
    }
10072
7.95k
  }
10073
12.5k
else gbptr[1] = LOOKBEHIND_MAX;
10074
10075
20.4k
return TRUE;
10076
20.5k
}
10077
10078
10079
10080
/*************************************************
10081
*         Check parsed pattern lookbehinds       *
10082
*************************************************/
10083
10084
/* This function is called at the end of parsing a pattern if any lookbehinds
10085
were encountered. It scans the parsed pattern for them, calling
10086
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
10087
the error offset is marked unset. This enables the functions above not to
10088
override settings from deeper nestings.
10089
10090
This function is called recursively from get_branchlength() for lookaheads in
10091
order to process any lookbehinds that they may contain. It stops when it hits a
10092
non-nested closing parenthesis in this case, returning a pointer to it.
10093
10094
Arguments
10095
  pptr      points to where to start (start of pattern or start of lookahead)
10096
  retptr    if not NULL, return the ket pointer here
10097
  recurses  chain of recurse_check to catch mutual recursion
10098
  cb        points to the compile block
10099
  lcptr     points to loop counter
10100
10101
Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
10102
*/
10103
10104
static int
10105
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
10106
  parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
10107
37.9k
{
10108
37.9k
int errorcode = 0;
10109
37.9k
int nestlevel = 0;
10110
10111
37.9k
cb->erroroffset = PCRE2_UNSET;
10112
10113
3.59M
for (; *pptr != META_END; pptr++)
10114
3.58M
  {
10115
3.58M
  if (*pptr < META_END) continue;  /* Literal */
10116
10117
1.04M
  switch (META_CODE(*pptr))
10118
1.04M
    {
10119
    /* The following erroroffset is a bogus but safe value. This branch should
10120
    be avoided by providing a proper implementation for all supported cases
10121
    below. */
10122
10123
    /* LCOV_EXCL_START */
10124
4
    default:
10125
4
    PCRE2_DEBUG_UNREACHABLE();
10126
4
    cb->erroroffset = 0;
10127
4
    return ERR70;  /* Unrecognized meta code */
10128
    /* LCOV_EXCL_STOP */
10129
10130
59.2k
    case META_ESCAPE:
10131
59.2k
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
10132
11.8k
      pptr += 1;    /* Skip prop data */
10133
59.2k
    break;
10134
10135
214k
    case META_KET:
10136
214k
    if (--nestlevel < 0)
10137
31.6k
      {
10138
31.6k
      if (retptr != NULL) *retptr = pptr;
10139
31.6k
      return 0;
10140
31.6k
      }
10141
183k
    break;
10142
10143
183k
    case META_ATOMIC:
10144
137k
    case META_CAPTURE:
10145
139k
    case META_COND_ASSERT:
10146
140k
    case META_SCS:
10147
148k
    case META_LOOKAHEAD:
10148
149k
    case META_LOOKAHEADNOT:
10149
150k
    case META_LOOKAHEAD_NA:
10150
170k
    case META_NOCAPTURE:
10151
171k
    case META_SCRIPT_RUN:
10152
171k
    nestlevel++;
10153
171k
    break;
10154
10155
1.07k
    case META_ACCEPT:
10156
134k
    case META_ALT:
10157
159k
    case META_ASTERISK:
10158
160k
    case META_ASTERISK_PLUS:
10159
164k
    case META_ASTERISK_QUERY:
10160
166k
    case META_BACKREF:
10161
173k
    case META_CIRCUMFLEX:
10162
196k
    case META_CLASS:
10163
199k
    case META_CLASS_EMPTY:
10164
199k
    case META_CLASS_EMPTY_NOT:
10165
231k
    case META_CLASS_END:
10166
240k
    case META_CLASS_NOT:
10167
240k
    case META_COMMIT:
10168
247k
    case META_DOLLAR:
10169
273k
    case META_DOT:
10170
274k
    case META_FAIL:
10171
299k
    case META_PLUS:
10172
300k
    case META_PLUS_PLUS:
10173
303k
    case META_PLUS_QUERY:
10174
303k
    case META_PRUNE:
10175
347k
    case META_QUERY:
10176
355k
    case META_QUERY_PLUS:
10177
356k
    case META_QUERY_QUERY:
10178
356k
    case META_RANGE_ESCAPED:
10179
357k
    case META_RANGE_LITERAL:
10180
359k
    case META_SKIP:
10181
360k
    case META_THEN:
10182
360k
    break;
10183
10184
1.29k
    case META_OFFSET:
10185
28.3k
    case META_RECURSE:
10186
28.3k
    pptr += SIZEOFFSET;
10187
28.3k
    break;
10188
10189
1.42k
    case META_BACKREF_BYNAME:
10190
1.73k
    case META_RECURSE_BYNAME:
10191
1.73k
    pptr += 1 + SIZEOFFSET;
10192
1.73k
    break;
10193
10194
106
    case META_COND_DEFINE:
10195
106
    pptr += SIZEOFFSET;
10196
106
    nestlevel++;
10197
106
    break;
10198
10199
494
    case META_COND_NAME:
10200
1.22k
    case META_COND_NUMBER:
10201
1.62k
    case META_COND_RNAME:
10202
12.2k
    case META_COND_RNUMBER:
10203
12.2k
    pptr += 1 + SIZEOFFSET;
10204
12.2k
    nestlevel++;
10205
12.2k
    break;
10206
10207
248
    case META_COND_VERSION:
10208
248
    pptr += 3;
10209
248
    nestlevel++;
10210
248
    break;
10211
10212
1.48k
    case META_CALLOUT_STRING:
10213
1.48k
    pptr += 3 + SIZEOFFSET;
10214
1.48k
    break;
10215
10216
0
    case META_BIGVALUE:
10217
883
    case META_POSIX:
10218
1.11k
    case META_POSIX_NEG:
10219
1.37k
    case META_CAPTURE_NAME:
10220
2.87k
    case META_CAPTURE_NUMBER:
10221
2.87k
    pptr += 1;
10222
2.87k
    break;
10223
10224
12.2k
    case META_MINMAX:
10225
12.6k
    case META_MINMAX_QUERY:
10226
15.4k
    case META_MINMAX_PLUS:
10227
17.2k
    case META_OPTIONS:
10228
17.2k
    pptr += 2;
10229
17.2k
    break;
10230
10231
151k
    case META_CALLOUT_NUMBER:
10232
151k
    pptr += 3;
10233
151k
    break;
10234
10235
437
    case META_MARK:
10236
957
    case META_COMMIT_ARG:
10237
1.74k
    case META_PRUNE_ARG:
10238
2.21k
    case META_SKIP_ARG:
10239
10.5k
    case META_THEN_ARG:
10240
10.5k
    pptr += 1 + pptr[1];
10241
10.5k
    break;
10242
10243
    /* Note that set_lookbehind_lengths() updates pptr, leaving it pointing to
10244
    the final ket of the group, so no need to update it here. */
10245
10246
4.60k
    case META_LOOKBEHIND:
10247
12.4k
    case META_LOOKBEHINDNOT:
10248
15.8k
    case META_LOOKBEHIND_NA:
10249
15.8k
    if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10250
562
      return errorcode;
10251
15.2k
    break;
10252
1.04M
    }
10253
1.04M
  }
10254
10255
5.74k
return 0;
10256
37.9k
}
10257
10258
10259
10260
/*************************************************
10261
*     External function to compile a pattern     *
10262
*************************************************/
10263
10264
/* This function reads a regular expression in the form of a string and returns
10265
a pointer to a block of store holding a compiled version of the expression.
10266
10267
Arguments:
10268
  pattern       the regular expression
10269
  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10270
  options       option bits
10271
  errorptr      pointer to errorcode
10272
  erroroffset   pointer to error offset
10273
  ccontext      points to a compile context or is NULL
10274
10275
Returns:        pointer to compiled data block, or NULL on error,
10276
                with errorcode and erroroffset set
10277
*/
10278
10279
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
10280
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10281
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10282
87.5k
{
10283
87.5k
BOOL utf;                             /* Set TRUE for UTF mode */
10284
87.5k
BOOL ucp;                             /* Set TRUE for UCP mode */
10285
87.5k
BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10286
87.5k
BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10287
87.5k
pcre2_real_code *re = NULL;           /* What we will return */
10288
87.5k
compile_block cb;                     /* "Static" compile-time data */
10289
87.5k
const uint8_t *tables;                /* Char tables base pointer */
10290
10291
87.5k
PCRE2_UCHAR null_str[1] = { 0xcd };   /* Dummy for handling null inputs */
10292
87.5k
PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10293
87.5k
PCRE2_UCHAR *codestart;               /* Start of compiled code */
10294
87.5k
PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10295
87.5k
uint32_t *pptr;                       /* Current pointer in parsed pattern */
10296
10297
87.5k
PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10298
87.5k
PCRE2_SIZE usedlength;                /* Actual length used */
10299
87.5k
PCRE2_SIZE re_blocksize;              /* Size of memory block */
10300
87.5k
PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10301
10302
87.5k
uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10303
87.5k
uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10304
87.5k
uint32_t setflags = 0;                /* NL and BSR set flags */
10305
87.5k
uint32_t xoptions;                    /* Flags from context, modified */
10306
10307
87.5k
uint32_t skipatstart;                 /* When checking (*UTF) etc */
10308
87.5k
uint32_t limit_heap  = UINT32_MAX;
10309
87.5k
uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10310
87.5k
uint32_t limit_depth = UINT32_MAX;
10311
10312
87.5k
int newline = 0;                      /* Unset; can be set by the pattern */
10313
87.5k
int bsr = 0;                          /* Unset; can be set by the pattern */
10314
87.5k
int errorcode = 0;                    /* Initialize to avoid compiler warn */
10315
87.5k
int regexrc;                          /* Return from compile */
10316
10317
87.5k
uint32_t i;                           /* Local loop counter */
10318
10319
/* Enable all optimizations by default. */
10320
87.5k
uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags :
10321
87.5k
                                          PCRE2_OPTIMIZATION_ALL;
10322
10323
/* Comments at the head of this file explain about these variables. */
10324
10325
87.5k
uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10326
87.5k
uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10327
87.5k
named_group named_groups[NAMED_GROUP_LIST_SIZE];
10328
10329
/* The workspace is used in different ways in the different compiling phases.
10330
It needs to be 16-bit aligned for the preliminary parsing scan. */
10331
10332
87.5k
uint32_t c16workspace[C16_WORK_SIZE];
10333
87.5k
PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10334
10335
10336
/* -------------- Check arguments and set up the pattern ----------------- */
10337
10338
/* There must be error code and offset pointers. */
10339
10340
87.5k
if (errorptr == NULL)
10341
0
  {
10342
0
  if (erroroffset != NULL) *erroroffset = 0;
10343
0
  return NULL;
10344
0
  }
10345
87.5k
if (erroroffset == NULL)
10346
0
  {
10347
0
  if (errorptr != NULL) *errorptr = ERR120;
10348
0
  return NULL;
10349
0
  }
10350
87.5k
*errorptr = ERR0;
10351
87.5k
*erroroffset = 0;
10352
10353
/* There must be a pattern, but NULL is allowed with zero length. */
10354
10355
87.5k
if (pattern == NULL)
10356
0
  {
10357
0
  if (patlen == 0)
10358
0
    pattern = null_str;
10359
0
  else
10360
0
    {
10361
0
    *errorptr = ERR16;
10362
0
    return NULL;
10363
0
    }
10364
0
  }
10365
10366
/* A NULL compile context means "use a default context" */
10367
10368
87.5k
if (ccontext == NULL)
10369
0
  ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10370
10371
/* PCRE2_MATCH_INVALID_UTF implies UTF */
10372
10373
87.5k
if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10374
10375
/* Check that all undefined public option bits are zero. */
10376
10377
87.5k
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10378
87.5k
    (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10379
0
  {
10380
0
  *errorptr = ERR17;
10381
0
  return NULL;
10382
0
  }
10383
10384
87.5k
if ((options & PCRE2_LITERAL) != 0 &&
10385
0
    ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10386
0
     (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10387
0
  {
10388
0
  *errorptr = ERR92;
10389
0
  return NULL;
10390
0
  }
10391
10392
/* A zero-terminated pattern is indicated by the special length value
10393
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10394
10395
87.5k
if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10396
0
  patlen = PRIV(strlen)(pattern);
10397
87.5k
(void)zero_terminated; /* Silence compiler; only used if Valgrind enabled */
10398
10399
87.5k
if (patlen > ccontext->max_pattern_length)
10400
0
  {
10401
0
  *errorptr = ERR88;
10402
0
  return NULL;
10403
0
  }
10404
10405
/* Optimization flags in 'options' can override those in the compile context.
10406
This is because some options to disable optimizations were added before the
10407
optimization flags word existed, and we need to continue supporting them
10408
for backwards compatibility. */
10409
10410
87.5k
if ((options & PCRE2_NO_AUTO_POSSESS) != 0)
10411
20.3k
  optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;
10412
87.5k
if ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
10413
9.48k
  optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;
10414
87.5k
if ((options & PCRE2_NO_START_OPTIMIZE) != 0)
10415
18.0k
  optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;
10416
10417
/* From here on, all returns from this function should end up going via the
10418
EXIT label. */
10419
10420
10421
/* ------------ Initialize the "static" compile data -------------- */
10422
10423
87.5k
tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10424
10425
87.5k
cb.lcc = tables + lcc_offset;          /* Individual */
10426
87.5k
cb.fcc = tables + fcc_offset;          /*   character */
10427
87.5k
cb.cbits = tables + cbits_offset;      /*      tables */
10428
87.5k
cb.ctypes = tables + ctypes_offset;
10429
10430
87.5k
cb.assert_depth = 0;
10431
87.5k
cb.bracount = 0;
10432
87.5k
cb.cx = ccontext;
10433
87.5k
cb.dupnames = FALSE;
10434
87.5k
cb.end_pattern = pattern + patlen;
10435
87.5k
cb.erroroffset = 0;
10436
87.5k
cb.external_flags = 0;
10437
87.5k
cb.external_options = options;
10438
87.5k
cb.groupinfo = stack_groupinfo;
10439
87.5k
cb.had_recurse = FALSE;
10440
87.5k
cb.lastcapture = 0;
10441
87.5k
cb.max_lookbehind = 0;                               /* Max encountered */
10442
87.5k
cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10443
87.5k
cb.name_entry_size = 0;
10444
87.5k
cb.name_table = NULL;
10445
87.5k
cb.named_groups = named_groups;
10446
87.5k
cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10447
87.5k
cb.names_found = 0;
10448
87.5k
cb.parens_depth = 0;
10449
87.5k
cb.parsed_pattern = stack_parsed_pattern;
10450
87.5k
cb.req_varyopt = 0;
10451
87.5k
cb.start_code = cworkspace;
10452
87.5k
cb.start_pattern = pattern;
10453
87.5k
cb.start_workspace = cworkspace;
10454
87.5k
cb.workspace_size = COMPILE_WORK_SIZE;
10455
87.5k
cb.first_data = NULL;
10456
87.5k
cb.last_data = NULL;
10457
87.5k
#ifdef SUPPORT_WIDE_CHARS
10458
87.5k
cb.char_lists_size = 0;
10459
87.5k
#endif
10460
10461
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10462
references to help in deciding whether (.*) can be treated as anchored or not.
10463
*/
10464
10465
87.5k
cb.top_backref = 0;
10466
87.5k
cb.backref_map = 0;
10467
10468
/* Escape sequences \1 to \9 are always back references, but as they are only
10469
two characters long, only two elements can be used in the parsed_pattern
10470
vector. The first contains the reference, and we'd like to use the second to
10471
record the offset in the pattern, so that forward references to non-existent
10472
groups can be diagnosed later with an offset. However, on 64-bit systems,
10473
PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10474
occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10475
references have enough space for the offset to be put into the parsed pattern.
10476
*/
10477
10478
962k
for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10479
10480
10481
/* --------------- Start looking at the pattern --------------- */
10482
10483
/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10484
the start of the pattern, and remember the offset to the actual regex. With
10485
valgrind support, make the terminator of a zero-terminated pattern
10486
inaccessible. This catches bugs that would otherwise only show up for
10487
non-zero-terminated patterns. */
10488
10489
#ifdef SUPPORT_VALGRIND
10490
if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10491
#endif
10492
10493
87.5k
xoptions = ccontext->extra_options;
10494
87.5k
ptr = pattern;
10495
87.5k
skipatstart = 0;
10496
10497
87.5k
if ((options & PCRE2_LITERAL) == 0)
10498
87.5k
  {
10499
91.2k
  while (patlen - skipatstart >= 2 &&
10500
90.6k
         ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10501
20.2k
         ptr[skipatstart+1] == CHAR_ASTERISK)
10502
4.80k
    {
10503
81.9k
    for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10504
80.8k
      {
10505
80.8k
      const pso *p = pso_list + i;
10506
10507
80.8k
      if (patlen - skipatstart - 2 >= p->length &&
10508
62.1k
          PRIV(strncmp_c8)(ptr + skipatstart + 2, p->name, p->length) == 0)
10509
3.70k
        {
10510
3.70k
        uint32_t c, pp;
10511
10512
3.70k
        skipatstart += p->length + 2;
10513
3.70k
        switch(p->type)
10514
3.70k
          {
10515
837
          case PSO_OPT:
10516
837
          cb.external_options |= p->value;
10517
837
          break;
10518
10519
0
          case PSO_XOPT:
10520
0
          xoptions |= p->value;
10521
0
          break;
10522
10523
0
          case PSO_FLG:
10524
0
          setflags |= p->value;
10525
0
          break;
10526
10527
2.86k
          case PSO_NL:
10528
2.86k
          newline = p->value;
10529
2.86k
          setflags |= PCRE2_NL_SET;
10530
2.86k
          break;
10531
10532
0
          case PSO_BSR:
10533
0
          bsr = p->value;
10534
0
          setflags |= PCRE2_BSR_SET;
10535
0
          break;
10536
10537
0
          case PSO_LIMM:
10538
0
          case PSO_LIMD:
10539
0
          case PSO_LIMH:
10540
0
          c = 0;
10541
0
          pp = skipatstart;
10542
0
          while (pp < patlen && IS_DIGIT(ptr[pp]))
10543
0
            {
10544
0
            if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10545
0
            c = c*10 + (ptr[pp++] - CHAR_0);
10546
0
            }
10547
0
          if (pp >= patlen || pp == skipatstart || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
10548
0
            {
10549
0
            errorcode = ERR60;
10550
0
            ptr += pp;
10551
0
            utf = FALSE;  /* Used by HAD_EARLY_ERROR */
10552
0
            goto HAD_EARLY_ERROR;
10553
0
            }
10554
0
          if (p->type == PSO_LIMH) limit_heap = c;
10555
0
            else if (p->type == PSO_LIMM) limit_match = c;
10556
0
            else limit_depth = c;
10557
0
          skipatstart = ++pp;
10558
0
          break;
10559
10560
0
          case PSO_OPTMZ:
10561
0
          optim_flags &= ~(p->value);
10562
10563
          /* For backward compatibility the three original VERBs to disable
10564
          optimizations need to also update the corresponding bit in the
10565
          external options. */
10566
10567
0
          switch(p->value)
10568
0
            {
10569
0
            case PCRE2_OPTIM_AUTO_POSSESS:
10570
0
            cb.external_options |= PCRE2_NO_AUTO_POSSESS;
10571
0
            break;
10572
10573
0
            case PCRE2_OPTIM_DOTSTAR_ANCHOR:
10574
0
            cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR;
10575
0
            break;
10576
10577
0
            case PCRE2_OPTIM_START_OPTIMIZE:
10578
0
            cb.external_options |= PCRE2_NO_START_OPTIMIZE;
10579
0
            break;
10580
0
            }
10581
10582
0
          break;
10583
10584
          /* LCOV_EXCL_START */
10585
0
          default:
10586
          /* All values in the enum need an explicit entry for this switch
10587
          but until a better way to prevent coding mistakes is invented keep
10588
          a catch all that triggers a debug build assert as a failsafe */
10589
0
          PCRE2_DEBUG_UNREACHABLE();
10590
          /* LCOV_EXCL_STOP */
10591
3.70k
          }
10592
3.70k
        break;   /* Out of the table scan loop */
10593
3.70k
        }
10594
80.8k
      }
10595
4.80k
    if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10596
4.80k
    }
10597
87.5k
    PCRE2_ASSERT(skipatstart <= patlen);
10598
87.5k
  }
10599
10600
/* End of pattern-start options; advance to start of real regex. */
10601
10602
87.5k
ptr += skipatstart;
10603
10604
/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10605
10606
#ifndef SUPPORT_UNICODE
10607
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10608
  {
10609
  errorcode = ERR32;
10610
  goto HAD_EARLY_ERROR;
10611
  }
10612
#endif
10613
10614
/* Check UTF. We have the original options in 'options', with that value as
10615
modified by (*UTF) etc in cb->external_options. The extra option
10616
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10617
surrogate code points cannot be represented in UTF-16. */
10618
10619
87.5k
utf = (cb.external_options & PCRE2_UTF) != 0;
10620
87.5k
if (utf)
10621
23.8k
  {
10622
23.8k
  if ((options & PCRE2_NEVER_UTF) != 0)
10623
0
    {
10624
0
    errorcode = ERR74;
10625
0
    goto HAD_EARLY_ERROR;
10626
0
    }
10627
23.8k
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10628
23.8k
       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10629
2.94k
    goto HAD_ERROR;  /* Offset was set by valid_utf() */
10630
10631
#if PCRE2_CODE_UNIT_WIDTH == 16
10632
  if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10633
    {
10634
    errorcode = ERR91;
10635
    goto HAD_EARLY_ERROR;
10636
    }
10637
#endif
10638
23.8k
  }
10639
10640
/* Check UCP lockout. */
10641
10642
84.5k
ucp = (cb.external_options & PCRE2_UCP) != 0;
10643
84.5k
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10644
0
  {
10645
0
  errorcode = ERR75;
10646
0
  goto HAD_EARLY_ERROR;
10647
0
  }
10648
10649
/* PCRE2_EXTRA_TURKISH_CASING checks */
10650
10651
84.5k
if ((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)
10652
0
  {
10653
0
  if (!utf && !ucp)
10654
0
    {
10655
0
    errorcode = ERR104;
10656
0
    goto HAD_EARLY_ERROR;
10657
0
    }
10658
10659
0
#if PCRE2_CODE_UNIT_WIDTH == 8
10660
0
  if (!utf)
10661
0
    {
10662
0
    errorcode = ERR105;
10663
0
    goto HAD_EARLY_ERROR;
10664
0
    }
10665
0
#endif
10666
10667
0
  if ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)
10668
0
    {
10669
0
    errorcode = ERR106;
10670
0
    goto HAD_EARLY_ERROR;
10671
0
    }
10672
0
  }
10673
10674
/* Process the BSR setting. */
10675
10676
84.5k
if (bsr == 0) bsr = ccontext->bsr_convention;
10677
10678
/* Process the newline setting. */
10679
10680
84.5k
if (newline == 0) newline = ccontext->newline_convention;
10681
84.5k
cb.nltype = NLTYPE_FIXED;
10682
84.5k
switch(newline)
10683
84.5k
  {
10684
22
  case PCRE2_NEWLINE_CR:
10685
22
  cb.nllen = 1;
10686
22
  cb.nl[0] = CHAR_CR;
10687
22
  break;
10688
10689
81.9k
  case PCRE2_NEWLINE_LF:
10690
81.9k
  cb.nllen = 1;
10691
81.9k
  cb.nl[0] = CHAR_NL;
10692
81.9k
  break;
10693
10694
0
  case PCRE2_NEWLINE_NUL:
10695
0
  cb.nllen = 1;
10696
0
  cb.nl[0] = CHAR_NUL;
10697
0
  break;
10698
10699
1.15k
  case PCRE2_NEWLINE_CRLF:
10700
1.15k
  cb.nllen = 2;
10701
1.15k
  cb.nl[0] = CHAR_CR;
10702
1.15k
  cb.nl[1] = CHAR_NL;
10703
1.15k
  break;
10704
10705
1.06k
  case PCRE2_NEWLINE_ANY:
10706
1.06k
  cb.nltype = NLTYPE_ANY;
10707
1.06k
  break;
10708
10709
369
  case PCRE2_NEWLINE_ANYCRLF:
10710
369
  cb.nltype = NLTYPE_ANYCRLF;
10711
369
  break;
10712
10713
  /* LCOV_EXCL_START */
10714
0
  default:
10715
0
  PCRE2_DEBUG_UNREACHABLE();
10716
0
  errorcode = ERR56;
10717
0
  goto HAD_EARLY_ERROR;
10718
  /* LCOV_EXCL_STOP */
10719
84.5k
  }
10720
10721
/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10722
their numerical equivalents, so that this information is always available for
10723
the remaining processing. (2) At the same time, parse the pattern and put a
10724
processed version into the parsed_pattern vector. This has escapes interpreted
10725
and comments removed (amongst other things). */
10726
10727
/* Ensure that the parsed pattern buffer is big enough. For many smaller
10728
patterns the vector on the stack (which was set up above) can be used. */
10729
10730
84.5k
parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options);
10731
10732
/* Allow for 2x uint32_t at the start and 2 at the end, for
10733
PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). */
10734
10735
84.5k
if ((ccontext->extra_options &
10736
84.5k
     (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10737
0
  parsed_size_needed += 4;
10738
10739
/* When PCRE2_AUTO_CALLOUT is set we allow for one callout at the end. */
10740
10741
84.5k
if ((options & PCRE2_AUTO_CALLOUT) != 0)
10742
16.8k
  parsed_size_needed += 4;
10743
10744
84.5k
parsed_size_needed += 1;  /* For the final META_END */
10745
10746
84.5k
if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE)
10747
2.37k
  {
10748
2.37k
  uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10749
2.37k
    parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data);
10750
2.37k
  if (heap_parsed_pattern == NULL)
10751
0
    {
10752
0
    *errorptr = ERR21;
10753
0
    goto EXIT;
10754
0
    }
10755
2.37k
  cb.parsed_pattern = heap_parsed_pattern;
10756
2.37k
  }
10757
84.5k
cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed;
10758
10759
/* Do the parsing scan. */
10760
10761
84.5k
errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb);
10762
84.5k
if (errorcode != 0) goto HAD_CB_ERROR;
10763
10764
/* If there are any lookbehinds, scan the parsed pattern to figure out their
10765
lengths. Workspace is needed to remember whether numbered groups are or are not
10766
of limited length, and if limited, what the minimum and maximum lengths are.
10767
This caching saves re-computing the length of any group that is referenced more
10768
than once, which is particularly relevant when recursion is involved.
10769
Unnumbered groups do not have this exposure because they cannot be referenced.
10770
If there are sufficiently few groups, the default index vector on the stack, as
10771
set up above, can be used. Otherwise we have to get/free some heap memory. The
10772
vector must be initialized to zero. */
10773
10774
78.0k
if (has_lookbehind)
10775
6.24k
  {
10776
6.24k
  int loopcount = 0;
10777
6.24k
  if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10778
65
    {
10779
65
    cb.groupinfo = ccontext->memctl.malloc(
10780
65
      (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10781
65
    if (cb.groupinfo == NULL)
10782
0
      {
10783
0
      errorcode = ERR21;
10784
0
      cb.erroroffset = 0;
10785
0
      goto HAD_CB_ERROR;
10786
0
      }
10787
65
    }
10788
6.24k
  memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10789
6.24k
  errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10790
6.24k
  if (errorcode != 0) goto HAD_CB_ERROR;
10791
6.24k
  }
10792
10793
/* For debugging, there is a function that shows the parsed pattern vector. */
10794
10795
#ifdef DEBUG_SHOW_PARSED
10796
fprintf(stderr, "+++ Pre-scan complete:\n");
10797
show_parsed(&cb);
10798
#endif
10799
10800
/* For debugging capturing information this code can be enabled. */
10801
10802
#ifdef DEBUG_SHOW_CAPTURES
10803
  {
10804
  named_group *ng = cb.named_groups;
10805
  fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10806
  for (i = 0; i < cb.names_found; i++, ng++)
10807
    {
10808
    fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10809
    }
10810
  }
10811
#endif
10812
10813
/* Pretend to compile the pattern while actually just accumulating the amount
10814
of memory required in the 'length' variable. This behaviour is triggered by
10815
passing a non-NULL final argument to compile_regex(). We pass a block of
10816
workspace (cworkspace) for it to compile parts of the pattern into; the
10817
compiled code is discarded when it is no longer needed, so hopefully this
10818
workspace will never overflow, though there is a test for its doing so.
10819
10820
On error, errorcode will be set non-zero, so we don't need to look at the
10821
result of the function. The initial options have been put into the cb block,
10822
but we still have to pass a separate options variable (the first argument)
10823
because the options may change as the pattern is processed. */
10824
10825
77.5k
cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10826
77.5k
pptr = cb.parsed_pattern;
10827
77.5k
code = cworkspace;
10828
77.5k
*code = OP_BRA;
10829
10830
77.5k
(void)compile_regex(cb.external_options, xoptions, &code, &pptr,
10831
77.5k
   &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10832
77.5k
   &cb, &length);
10833
10834
77.5k
if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10835
10836
/* This should be caught in compile_regex(), but just in case... */
10837
10838
75.9k
#if defined SUPPORT_WIDE_CHARS
10839
75.9k
PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);
10840
75.9k
if (length > MAX_PATTERN_SIZE ||
10841
75.9k
    MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))
10842
#else
10843
if (length > MAX_PATTERN_SIZE)
10844
#endif
10845
1
  {
10846
1
  errorcode = ERR20;
10847
1
  cb.erroroffset = 0;
10848
1
  goto HAD_CB_ERROR;
10849
1
  }
10850
10851
/* Compute the size of, then, if not too large, get and initialize the data
10852
block for storing the compiled pattern and names table. Integer overflow should
10853
no longer be possible because nowadays we limit the maximum value of
10854
cb.names_found and cb.name_entry_size. */
10855
10856
75.9k
re_blocksize =
10857
75.9k
  CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10858
10859
75.9k
#if defined SUPPORT_WIDE_CHARS
10860
75.9k
if (cb.char_lists_size != 0)
10861
751
  {
10862
751
#if PCRE2_CODE_UNIT_WIDTH != 32
10863
  /* Align to 32 bit first. This ensures the
10864
  allocated area will also be 32 bit aligned. */
10865
751
  re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));
10866
751
#endif
10867
751
  re_blocksize += cb.char_lists_size;
10868
751
  }
10869
75.9k
#endif
10870
10871
75.9k
re_blocksize += CU2BYTES(length);
10872
10873
75.9k
if (re_blocksize > ccontext->max_pattern_compiled_length)
10874
56
  {
10875
56
  errorcode = ERR101;
10876
56
  cb.erroroffset = 0;
10877
56
  goto HAD_CB_ERROR;
10878
56
  }
10879
10880
75.8k
re_blocksize += sizeof(pcre2_real_code);
10881
75.8k
re = (pcre2_real_code *)
10882
75.8k
  ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10883
75.8k
if (re == NULL)
10884
0
  {
10885
0
  errorcode = ERR21;
10886
0
  cb.erroroffset = 0;
10887
0
  goto HAD_CB_ERROR;
10888
0
  }
10889
10890
/* The compiler may put padding at the end of the pcre2_real_code structure in
10891
order to round it up to a multiple of 4 or 8 bytes. This means that when a
10892
compiled pattern is copied (for example, when serialized) undefined bytes are
10893
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10894
write to the last 8 bytes of the structure before setting the fields. */
10895
10896
75.8k
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10897
75.8k
re->memctl = ccontext->memctl;
10898
75.8k
re->tables = tables;
10899
75.8k
re->executable_jit = NULL;
10900
75.8k
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10901
75.8k
re->blocksize = re_blocksize;
10902
75.8k
re->code_start = re_blocksize - CU2BYTES(length);
10903
75.8k
re->magic_number = MAGIC_NUMBER;
10904
75.8k
re->compile_options = options;
10905
75.8k
re->overall_options = cb.external_options;
10906
75.8k
re->extra_options = xoptions;
10907
75.8k
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10908
75.8k
re->limit_heap = limit_heap;
10909
75.8k
re->limit_match = limit_match;
10910
75.8k
re->limit_depth = limit_depth;
10911
75.8k
re->first_codeunit = 0;
10912
75.8k
re->last_codeunit = 0;
10913
75.8k
re->bsr_convention = bsr;
10914
75.8k
re->newline_convention = newline;
10915
75.8k
re->max_lookbehind = 0;
10916
75.8k
re->minlength = 0;
10917
75.8k
re->top_bracket = 0;
10918
75.8k
re->top_backref = 0;
10919
75.8k
re->name_entry_size = cb.name_entry_size;
10920
75.8k
re->name_count = cb.names_found;
10921
75.8k
re->optimization_flags = optim_flags;
10922
10923
/* The basic block is immediately followed by the name table, and the compiled
10924
code follows after that. */
10925
10926
75.8k
codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
10927
10928
/* Update the compile data block for the actual compile. The starting points of
10929
the name/number translation table and of the code are passed around in the
10930
compile data block. The start/end pattern and initial options are already set
10931
from the pre-compile phase, as is the name_entry_size field. */
10932
10933
75.8k
cb.parens_depth = 0;
10934
75.8k
cb.assert_depth = 0;
10935
75.8k
cb.lastcapture = 0;
10936
75.8k
cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10937
75.8k
cb.start_code = codestart;
10938
75.8k
cb.req_varyopt = 0;
10939
75.8k
cb.had_accept = FALSE;
10940
75.8k
cb.had_pruneorskip = FALSE;
10941
75.8k
#ifdef SUPPORT_WIDE_CHARS
10942
75.8k
cb.char_lists_size = 0;
10943
75.8k
#endif
10944
10945
10946
/* If any named groups were found, create the name/number table from the list
10947
created in the pre-pass. */
10948
10949
75.8k
if (cb.names_found > 0)
10950
1.39k
  {
10951
1.39k
  named_group *ng = cb.named_groups;
10952
1.39k
  uint32_t tablecount = 0;
10953
10954
  /* Length 0 represents duplicates, and they have already been handled. */
10955
11.0k
  for (i = 0; i < cb.names_found; i++, ng++)
10956
9.62k
    if (ng->length > 0)
10957
2.49k
      tablecount = PRIV(compile_add_name_to_table)(&cb, ng, tablecount);
10958
10959
1.39k
  PCRE2_ASSERT(tablecount == cb.names_found);
10960
1.39k
  }
10961
10962
/* Set up a starting, non-extracting bracket, then compile the expression. On
10963
error, errorcode will be set non-zero, so we don't need to look at the result
10964
of the function here. */
10965
10966
75.8k
pptr = cb.parsed_pattern;
10967
75.8k
code = (PCRE2_UCHAR *)codestart;
10968
75.8k
*code = OP_BRA;
10969
75.8k
regexrc = compile_regex(re->overall_options, re->extra_options, &code,
10970
75.8k
  &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10971
75.8k
  NULL, &cb, NULL);
10972
75.8k
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10973
75.8k
re->top_bracket = cb.bracount;
10974
75.8k
re->top_backref = cb.top_backref;
10975
75.8k
re->max_lookbehind = cb.max_lookbehind;
10976
10977
75.8k
if (cb.had_accept)
10978
2.01k
  {
10979
2.01k
  reqcu = 0;                     /* Must disable after (*ACCEPT) */
10980
2.01k
  reqcuflags = REQ_NONE;
10981
2.01k
  re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10982
2.01k
  }
10983
10984
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10985
but the estimated length exceeds the really used length, adjust the value of
10986
re->blocksize, and if valgrind support is configured, mark the extra allocated
10987
memory as unaddressable, so that any out-of-bound reads can be detected. */
10988
10989
75.8k
*code++ = OP_END;
10990
75.8k
usedlength = code - codestart;
10991
/* LCOV_EXCL_START */
10992
75.8k
if (usedlength > length)
10993
0
  {
10994
0
  PCRE2_DEBUG_UNREACHABLE();
10995
0
  errorcode = ERR23;  /* Overflow of code block - internal error */
10996
0
  cb.erroroffset = 0;
10997
0
  goto HAD_CB_ERROR;
10998
0
  }
10999
/* LCOV_EXCL_STOP */
11000
11001
75.8k
re->blocksize -= CU2BYTES(length - usedlength);
11002
#ifdef SUPPORT_VALGRIND
11003
VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
11004
#endif
11005
11006
/* Scan the pattern for recursion/subroutine calls and convert the group
11007
numbers into offsets. Maintain a small cache so that repeated groups containing
11008
recursions are efficiently handled. */
11009
11010
75.8k
#define RSCAN_CACHE_SIZE 8
11011
11012
75.8k
if (errorcode == 0 && cb.had_recurse)
11013
7.05k
  {
11014
7.05k
  PCRE2_UCHAR *rcode;
11015
7.05k
  PCRE2_SPTR rgroup;
11016
7.05k
  unsigned int ccount = 0;
11017
7.05k
  int start = RSCAN_CACHE_SIZE;
11018
7.05k
  recurse_cache rc[RSCAN_CACHE_SIZE];
11019
11020
7.05k
  for (rcode = find_recurse(codestart, utf);
11021
1.30M
       rcode != NULL;
11022
1.29M
       rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))
11023
1.29M
    {
11024
1.29M
    int p, groupnumber;
11025
11026
1.29M
    groupnumber = (int)GET(rcode, 1);
11027
1.29M
    if (groupnumber == 0) rgroup = codestart; else
11028
938k
      {
11029
938k
      PCRE2_SPTR search_from = codestart;
11030
938k
      rgroup = NULL;
11031
1.29M
      for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
11032
1.24M
        {
11033
1.24M
        if (groupnumber == rc[p].groupnumber)
11034
893k
          {
11035
893k
          rgroup = rc[p].group;
11036
893k
          break;
11037
893k
          }
11038
11039
        /* Group n+1 must always start to the right of group n, so we can save
11040
        search time below when the new group number is greater than any of the
11041
        previously found groups. */
11042
11043
351k
        if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
11044
351k
        }
11045
11046
938k
      if (rgroup == NULL)
11047
44.8k
        {
11048
44.8k
        rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
11049
        /* LCOV_EXCL_START */
11050
44.8k
        if (rgroup == NULL)
11051
0
          {
11052
0
          PCRE2_DEBUG_UNREACHABLE();
11053
0
          errorcode = ERR53;
11054
0
          break;
11055
0
          }
11056
        /* LCOV_EXCL_STOP */
11057
11058
44.8k
        if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
11059
44.8k
        rc[start].groupnumber = groupnumber;
11060
44.8k
        rc[start].group = rgroup;
11061
44.8k
        if (ccount < RSCAN_CACHE_SIZE) ccount++;
11062
44.8k
        }
11063
938k
      }
11064
11065
1.29M
    PUT(rcode, 1, (uint32_t)(rgroup - codestart));
11066
1.29M
    }
11067
7.05k
  }
11068
11069
/* In rare debugging situations we sometimes need to look at the compiled code
11070
at this stage. */
11071
11072
#ifdef DEBUG_CALL_PRINTINT
11073
pcre2_printint(re, stderr, TRUE);
11074
fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
11075
#endif
11076
11077
/* Unless disabled, check whether any single character iterators can be
11078
auto-possessified. The function overwrites the appropriate opcode values, so
11079
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
11080
used in this code because at least one compiler gives a warning about loss of
11081
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
11082
function call. */
11083
11084
75.8k
if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS) != 0)
11085
58.7k
  {
11086
58.7k
  PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
11087
58.7k
  int possessify_rc = PRIV(auto_possessify)(temp, &cb);
11088
  /* LCOV_EXCL_START */
11089
58.7k
  if (possessify_rc != 0)
11090
0
    {
11091
0
    PCRE2_DEBUG_UNREACHABLE();
11092
0
    errorcode = ERR80;
11093
0
    cb.erroroffset = 0;
11094
0
    }
11095
  /* LCOV_EXCL_STOP */
11096
58.7k
  }
11097
11098
/* Failed to compile, or error while post-processing. */
11099
11100
75.8k
if (errorcode != 0) goto HAD_CB_ERROR;
11101
11102
/* Successful compile. If the anchored option was not passed, set it if
11103
we can determine that the pattern is anchored by virtue of ^ characters or \A
11104
or anything else, such as starting with non-atomic .* when DOTALL is set and
11105
there are no occurrences of *PRUNE or *SKIP (though there is an option to
11106
disable this case). */
11107
11108
75.7k
if ((re->overall_options & PCRE2_ANCHORED) == 0)
11109
71.5k
  {
11110
71.5k
  BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
11111
71.5k
  if (is_anchored(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
11112
563
    re->overall_options |= PCRE2_ANCHORED;
11113
71.5k
  }
11114
11115
/* Set up the first code unit or startline flag, the required code unit, and
11116
then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE
11117
is disabled, as the data it would create will not be used. Note that a first code
11118
unit (but not the startline flag) is useful for anchored patterns because it
11119
can still give a quick "no match" and also avoid searching for a last code
11120
unit. */
11121
11122
75.7k
if ((optim_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)
11123
60.7k
  {
11124
60.7k
  int minminlength = 0;  /* For minimal minlength from first/required CU */
11125
60.7k
  int study_rc;
11126
11127
  /* If we do not have a first code unit, see if there is one that is asserted
11128
  (these are not saved during the compile because they can cause conflicts with
11129
  actual literals that follow). */
11130
11131
60.7k
  if (firstcuflags >= REQ_NONE) {
11132
40.3k
    uint32_t assertedcuflags = 0;
11133
40.3k
    uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0);
11134
    /* It would be wrong to use the asserted first code unit as `firstcu` for
11135
     * regexes which are able to match a 1-character string (e.g. /(?=a)b?a/).
11136
     * For that example, if we set both firstcu and reqcu to 'a', it would mean
11137
     * the subject string needs to be at least 2 characters long, which is wrong.
11138
     * With more analysis, we would be able to set firstcu in more cases. */
11139
40.3k
    if (assertedcuflags < REQ_NONE && assertedcu != reqcu) {
11140
251
      firstcu = assertedcu;
11141
251
      firstcuflags = assertedcuflags;
11142
251
    }
11143
40.3k
  }
11144
11145
  /* Save the data for a first code unit. The existence of one means the
11146
  minimum length must be at least 1. */
11147
11148
60.7k
  if (firstcuflags < REQ_NONE)
11149
20.6k
    {
11150
20.6k
    re->first_codeunit = firstcu;
11151
20.6k
    re->flags |= PCRE2_FIRSTSET;
11152
20.6k
    minminlength++;
11153
11154
    /* Handle caseless first code units. */
11155
11156
20.6k
    if ((firstcuflags & REQ_CASELESS) != 0)
11157
3.86k
      {
11158
3.86k
      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
11159
3.46k
        {
11160
3.46k
        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
11161
3.46k
        }
11162
11163
      /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
11164
      In 8-bit UTF mode, code units in the range 128-255 are introductory code
11165
      units and cannot have another case, but if UCP is set they may do. */
11166
11167
404
#ifdef SUPPORT_UNICODE
11168
404
#if PCRE2_CODE_UNIT_WIDTH == 8
11169
404
      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
11170
158
        re->flags |= PCRE2_FIRSTCASELESS;
11171
#else
11172
      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
11173
               UCD_OTHERCASE(firstcu) != firstcu)
11174
        re->flags |= PCRE2_FIRSTCASELESS;
11175
#endif
11176
3.86k
#endif  /* SUPPORT_UNICODE */
11177
3.86k
      }
11178
20.6k
    }
11179
11180
  /* When there is no first code unit, for non-anchored patterns, see if we can
11181
  set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
11182
  branches start with ^ and also when all branches start with non-atomic .* for
11183
  non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
11184
  that disables this case.) */
11185
11186
40.0k
  else if ((re->overall_options & PCRE2_ANCHORED) == 0)
11187
38.6k
    {
11188
38.6k
    BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
11189
38.6k
    if (is_startline(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
11190
1.11k
      re->flags |= PCRE2_STARTLINE;
11191
38.6k
    }
11192
11193
  /* Handle the "required code unit", if one is set. In the UTF case we can
11194
  increment the minimum minimum length only if we are sure this really is a
11195
  different character and not a non-starting code unit of the first character,
11196
  because the minimum length count is in characters, not code units. */
11197
11198
60.7k
  if (reqcuflags < REQ_NONE)
11199
33.4k
    {
11200
#if PCRE2_CODE_UNIT_WIDTH == 16
11201
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
11202
        firstcuflags >= REQ_NONE ||                 /* First not set */
11203
        (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
11204
        (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
11205
#elif PCRE2_CODE_UNIT_WIDTH == 8
11206
33.4k
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
11207
5.03k
        firstcuflags >= REQ_NONE ||                 /* First not set */
11208
1.67k
        (firstcu & 0x80) == 0 ||                    /* First is ASCII */
11209
92
        (reqcu & 0x80) == 0)                        /* Req is ASCII */
11210
33.3k
#endif
11211
33.3k
      {
11212
33.3k
      minminlength++;
11213
33.3k
      }
11214
11215
    /* In the case of an anchored pattern, set up the value only if it follows
11216
    a variable length item in the pattern. */
11217
11218
33.4k
    if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
11219
1.24k
        (reqcuflags & REQ_VARY) != 0)
11220
32.9k
      {
11221
32.9k
      re->last_codeunit = reqcu;
11222
32.9k
      re->flags |= PCRE2_LASTSET;
11223
11224
      /* Handle caseless required code units as for first code units (above). */
11225
11226
32.9k
      if ((reqcuflags & REQ_CASELESS) != 0)
11227
5.98k
        {
11228
5.98k
        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
11229
5.58k
          {
11230
5.58k
          if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
11231
5.58k
          }
11232
405
#ifdef SUPPORT_UNICODE
11233
405
#if PCRE2_CODE_UNIT_WIDTH == 8
11234
405
      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
11235
184
        re->flags |= PCRE2_LASTCASELESS;
11236
#else
11237
      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
11238
               UCD_OTHERCASE(reqcu) != reqcu)
11239
        re->flags |= PCRE2_LASTCASELESS;
11240
#endif
11241
5.98k
#endif  /* SUPPORT_UNICODE */
11242
5.98k
        }
11243
32.9k
      }
11244
33.4k
    }
11245
11246
  /* Study the compiled pattern to set up information such as a bitmap of
11247
  starting code units and a minimum matching length. */
11248
11249
60.7k
  study_rc = PRIV(study)(re);
11250
  /* LCOV_EXCL_START */
11251
60.7k
  if (study_rc != 0)
11252
0
    {
11253
0
    PCRE2_DEBUG_UNREACHABLE();
11254
0
    errorcode = ERR31;
11255
0
    cb.erroroffset = 0;
11256
0
    goto HAD_CB_ERROR;
11257
0
    }
11258
  /* LCOV_EXCL_STOP */
11259
11260
  /* If study() set a bitmap of starting code units, it implies a minimum
11261
  length of at least one. */
11262
11263
60.7k
  if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
11264
5.53k
    minminlength = 1;
11265
11266
  /* If the minimum length set (or not set) by study() is less than the minimum
11267
  implied by required code units, override it. */
11268
11269
60.7k
  if (re->minlength < minminlength) re->minlength = minminlength;
11270
60.7k
  }   /* End of start-of-match optimizations. */
11271
11272
/* Control ends up here in all cases. When running under valgrind, make a
11273
pattern's terminating zero defined again. If memory was obtained for the parsed
11274
version of the pattern, free it before returning. Also free the list of named
11275
groups if a larger one had to be obtained, and likewise the group information
11276
vector. */
11277
11278
75.7k
#ifdef SUPPORT_UNICODE
11279
/* All items must be freed. */
11280
75.7k
PCRE2_ASSERT(cb.first_data == NULL);
11281
75.7k
#endif
11282
11283
87.5k
EXIT:
11284
#ifdef SUPPORT_VALGRIND
11285
if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
11286
#endif
11287
87.5k
if (cb.parsed_pattern != stack_parsed_pattern)
11288
2.37k
  ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
11289
87.5k
if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
11290
129
  ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
11291
87.5k
if (cb.groupinfo != stack_groupinfo)
11292
65
  ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
11293
11294
87.5k
return re;    /* Will be NULL after an error */
11295
11296
/* Errors discovered in parse_regex() set the offset value in the compile
11297
block. Errors discovered before it is called must compute it from the ptr
11298
value. After parse_regex() is called, the offset in the compile block is set to
11299
the end of the pattern, but certain errors in compile_regex() may reset it if
11300
an offset is available in the parsed pattern. */
11301
11302
8.77k
HAD_CB_ERROR:
11303
8.77k
ptr = pattern + cb.erroroffset;
11304
11305
8.77k
HAD_EARLY_ERROR:
11306
/* Ensure we don't return out-of-range erroroffset. */
11307
8.77k
PCRE2_ASSERT(ptr >= pattern);
11308
8.77k
PCRE2_ASSERT(ptr <= (pattern + patlen));
11309
/* Ensure that the erroroffset never slices a UTF-encoded character in half.
11310
If the input is invalid, then we return an offset just before the first invalid
11311
character, so the text to the left of the offset must always be valid. */
11312
#if defined PCRE2_DEBUG && defined SUPPORT_UNICODE
11313
if (ptr > pattern && utf)
11314
  {
11315
  PCRE2_SPTR prev = ptr - 1;
11316
  PCRE2_SIZE dummyoffset;
11317
  BACKCHAR(prev);
11318
  PCRE2_ASSERT(prev >= pattern);
11319
  PCRE2_ASSERT(PRIV(valid_utf)(prev, ptr - prev, &dummyoffset) == 0);
11320
  }
11321
#endif
11322
8.77k
*erroroffset = ptr - pattern;
11323
11324
11.7k
HAD_ERROR:
11325
11.7k
*errorptr = errorcode;
11326
11.7k
pcre2_code_free(re);
11327
11.7k
re = NULL;
11328
11329
11.7k
if (cb.first_data != NULL)
11330
92
  {
11331
92
  compile_data* current_data = cb.first_data;
11332
92
  do
11333
561
    {
11334
561
    compile_data* next_data = current_data->next;
11335
561
    cb.cx->memctl.free(current_data, cb.cx->memctl.memory_data);
11336
561
    current_data = next_data;
11337
561
    }
11338
561
  while (current_data != NULL);
11339
92
  }
11340
11341
11.7k
goto EXIT;
11342
8.77k
}
11343
11344
/* These #undefs are here to enable unity builds with CMake. In a unity build
several source files are combined into one translation unit, so any macro left
defined at the end of this file would still be visible to the source file that
is compiled after it. */
11345
11346
#undef NLBLOCK /* Block containing newline information */
11347
#undef PSSTART /* Field containing processed string start */
11348
#undef PSEND   /* Field containing processed string end */
11349
11350
/* End of pcre2_compile.c */