Coverage Report

Created: 2026-02-14 07:04

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/pcre2/src/pcre2_compile.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#include "pcre2_compile.h"
43
44
45
46
166k
#define NLBLOCK cb             /* Block containing newline information */
47
#define PSSTART start_pattern  /* Field containing processed string start */
48
41.3k
#define PSEND   end_pattern    /* Field containing processed string end */
49
50
/* In rare error cases debugging might require calling pcre2_printint(). */
51
52
#if 0
53
#ifdef EBCDIC
54
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
55
#else
56
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
57
#endif
58
#define CHAR_OUTPUT(c)      (c)
59
#define CHAR_OUTPUT_HEX(c)  (c)
60
#define CHAR_INPUT(c)       (c)
61
#define CHAR_INPUT_HEX(c)   (c)
62
#include "pcre2_printint_inc.h"
63
#undef PRINTABLE
64
#undef CHAR_OUTPUT
65
#undef CHAR_OUTPUT_HEX
66
#undef CHAR_INPUT
67
#define DEBUG_CALL_PRINTINT
68
#endif
69
70
/* Other debugging code can be enabled by these defines. */
71
72
/* #define DEBUG_SHOW_CAPTURES */
73
/* #define DEBUG_SHOW_PARSED */
74
75
/* There are a few things that vary with different code unit sizes. Handle them
76
by defining macros in order to minimize #if usage. */
77
78
#if PCRE2_CODE_UNIT_WIDTH == 8
79
#define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
80
#define XDIGIT(c)                xdigitab[c]
81
82
#else  /* Either 16-bit or 32-bit */
83
17.2k
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
84
85
#if PCRE2_CODE_UNIT_WIDTH == 16
86
#define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
87
88
#else  /* 32-bit */
89
#define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
90
#endif
91
#endif
92
93
/* Forward declarations (prototypes) to allow mutual recursion */
94
95
static int
96
  compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
97
    uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
98
    open_capitem *, compile_block *, PCRE2_SIZE *);
99
100
static int
101
  get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
102
    compile_block *);
103
104
static BOOL
105
  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
106
    compile_block *);
107
108
static int
109
  check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
110
    compile_block *, int *);
111
112
113
/*************************************************
114
*      Code parameters and static tables         *
115
*************************************************/
116
117
195k
#define MAX_GROUP_NUMBER   65535u
118
1.57M
#define MAX_REPEAT_COUNT   65535u
119
1.42M
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
120
121
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
122
different ways in the different pattern scans. The parsing and group-
123
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
124
aligned for this. Having defined the size in code units, we set up
125
C16_WORK_SIZE as the number of elements in the 16-bit vector.
126
127
During the first compiling phase, when determining how much memory is required,
128
the regex is partly compiled into this space, but the compiled parts are
129
discarded as soon as they can be, so that hopefully there will never be an
130
overrun. The code does, however, check for an overrun, which can occur for
131
pathological patterns. The size of the workspace depends on LINK_SIZE because
132
the length of compiled items varies with this.
133
134
In the real compile phase, this workspace is not currently used. */
135
136
82.9k
#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
137
138
#define C16_WORK_SIZE \
139
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
140
141
/* A uint32_t vector is used for caching information about the size of
142
capturing groups, to improve performance. A default is created on the stack of
143
this size. */
144
145
5.60k
#define GROUPINFO_DEFAULT_SIZE 256
146
147
/* The overrun tests check for a slightly smaller size so that they detect the
148
overrun before it actually does run off the end of the data block. */
149
150
6.40M
#define WORK_SIZE_SAFETY_MARGIN (100)
151
152
/* This value determines the size of the initial vector that is used for
153
remembering named groups during the pre-compile. It is allocated on the stack,
154
but if it is too small, it is expanded, in a similar way to the workspace. The
155
value is the number of slots in the list. */
156
157
166k
#define NAMED_GROUP_LIST_SIZE  20
158
159
/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
160
of uint32_t. For short patterns this lives on the stack, with this size. Heap
161
memory is used for longer patterns. */
162
163
82.1k
#define PARSED_PATTERN_DEFAULT_SIZE 1024
164
165
/* Maximum length value to check against when making sure that the variable
166
that holds the compiled pattern length does not overflow. We make it a bit less
167
than INT_MAX to allow for adding in group terminating code units, so that we
168
don't have to check them every time. */
169
170
6.52M
#define OFLOW_MAX (INT_MAX - 20)
171
172
/* Table of extra lengths for each of the meta codes. Must be kept in step with
173
the definitions above. For some items these values are a basic length to which
174
a variable amount has to be added. */
175
176
static unsigned char meta_extra_lengths[] = {
177
  0,             /* META_END */
178
  0,             /* META_ALT */
179
  0,             /* META_ATOMIC */
180
  0,             /* META_BACKREF - more if group is >= 10 */
181
  1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
182
  1,             /* META_BIGVALUE */
183
  3,             /* META_CALLOUT_NUMBER */
184
  3+SIZEOFFSET,  /* META_CALLOUT_STRING */
185
  0,             /* META_CAPTURE */
186
  0,             /* META_CIRCUMFLEX */
187
  0,             /* META_CLASS */
188
  0,             /* META_CLASS_EMPTY */
189
  0,             /* META_CLASS_EMPTY_NOT */
190
  0,             /* META_CLASS_END */
191
  0,             /* META_CLASS_NOT */
192
  0,             /* META_COND_ASSERT */
193
  SIZEOFFSET,    /* META_COND_DEFINE */
194
  1+SIZEOFFSET,  /* META_COND_NAME */
195
  1+SIZEOFFSET,  /* META_COND_NUMBER */
196
  1+SIZEOFFSET,  /* META_COND_RNAME */
197
  1+SIZEOFFSET,  /* META_COND_RNUMBER */
198
  3,             /* META_COND_VERSION */
199
  SIZEOFFSET,    /* META_OFFSET */
200
  0,             /* META_SCS */
201
  1,             /* META_CAPTURE_NAME */
202
  1,             /* META_CAPTURE_NUMBER */
203
  0,             /* META_DOLLAR */
204
  0,             /* META_DOT */
205
  0,             /* META_ESCAPE - one more for ESC_P and ESC_p */
206
  0,             /* META_KET */
207
  0,             /* META_NOCAPTURE */
208
  2,             /* META_OPTIONS */
209
  1,             /* META_POSIX */
210
  1,             /* META_POSIX_NEG */
211
  0,             /* META_RANGE_ESCAPED */
212
  0,             /* META_RANGE_LITERAL */
213
  SIZEOFFSET,    /* META_RECURSE */
214
  1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
215
  0,             /* META_SCRIPT_RUN */
216
  0,             /* META_LOOKAHEAD */
217
  0,             /* META_LOOKAHEADNOT */
218
  SIZEOFFSET,    /* META_LOOKBEHIND */
219
  SIZEOFFSET,    /* META_LOOKBEHINDNOT */
220
  0,             /* META_LOOKAHEAD_NA */
221
  SIZEOFFSET,    /* META_LOOKBEHIND_NA */
222
  1,             /* META_MARK - plus the string length */
223
  0,             /* META_ACCEPT */
224
  0,             /* META_FAIL */
225
  0,             /* META_COMMIT */
226
  1,             /* META_COMMIT_ARG - plus the string length */
227
  0,             /* META_PRUNE */
228
  1,             /* META_PRUNE_ARG - plus the string length */
229
  0,             /* META_SKIP */
230
  1,             /* META_SKIP_ARG - plus the string length */
231
  0,             /* META_THEN */
232
  1,             /* META_THEN_ARG - plus the string length */
233
  0,             /* META_ASTERISK */
234
  0,             /* META_ASTERISK_PLUS */
235
  0,             /* META_ASTERISK_QUERY */
236
  0,             /* META_PLUS */
237
  0,             /* META_PLUS_PLUS */
238
  0,             /* META_PLUS_QUERY */
239
  0,             /* META_QUERY */
240
  0,             /* META_QUERY_PLUS */
241
  0,             /* META_QUERY_QUERY */
242
  2,             /* META_MINMAX */
243
  2,             /* META_MINMAX_PLUS */
244
  2,             /* META_MINMAX_QUERY */
245
  0,             /* META_ECLASS_AND */
246
  0,             /* META_ECLASS_OR */
247
  0,             /* META_ECLASS_SUB */
248
  0,             /* META_ECLASS_XOR */
249
  0              /* META_ECLASS_NOT */
250
};
251
252
/* Types for skipping parts of a parsed pattern. */
253
254
enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
255
256
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
257
variables, which are concerned with first and required code units. A value
258
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
259
matching xxcu variable is set, and the low valued bits are relevant. */
260
261
12.0M
#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
262
1.94M
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
263
241k
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
264
1.36M
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
265
266
/* These flags are used in the groupinfo vector. */
267
268
5.41k
#define GI_SET_FIXED_LENGTH    0x80000000u
269
3.68k
#define GI_NOT_FIXED_LENGTH    0x40000000u
270
1.47k
#define GI_FIXED_LENGTH_MASK   0x0000ffffu
271
272
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
273
and is fast (a good compiler can turn it into a subtraction and unsigned
274
comparison). */
275
276
1.14M
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
277
278
/* Table to identify hex digits. The tables in chartables are dependent on the
279
locale, and may mark arbitrary characters as digits. We want to recognize only
280
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
281
costs 256 bytes, but it is a lot faster than doing character value tests (at
282
least in some simple cases I timed), and in some applications one wants PCRE2
283
to compile efficiently as well as match efficiently. The value in the table is
284
the binary hex digit value, or 0xff for non-hex digits. */
285
286
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
287
UTF-8 mode. */
288
289
#ifndef EBCDIC
290
static const uint8_t xdigitab[] =
291
  {
292
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
293
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
294
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
295
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
296
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
297
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
298
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
299
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
300
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
301
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
302
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
303
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
304
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
305
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
306
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
307
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
308
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
309
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
310
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
311
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
312
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
313
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
314
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
315
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
316
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
317
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
318
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
319
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
320
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
321
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
322
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
323
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
324
325
#else
326
327
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
328
329
static const uint8_t xdigitab[] =
330
  {
331
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
332
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
333
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
334
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
335
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
336
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
337
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
338
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
339
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
340
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
341
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
342
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
343
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
344
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
345
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
346
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
347
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
348
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
349
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
350
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
351
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
352
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
353
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
354
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
355
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
356
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
357
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
358
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
359
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
360
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
361
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
362
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
363
#endif  /* EBCDIC */
364
365
366
/* Table for handling alphanumeric escaped characters. Positive returns are
367
simple data values; negative values are for special things like \d and so on.
368
Zero means further processing is needed (for things like \x), or the escape is
369
invalid. */
370
371
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
372
in UTF-8 mode. It runs from '0' to 'z'. */
373
374
#ifndef EBCDIC
375
498k
#define ESCAPES_FIRST       CHAR_0
376
260k
#define ESCAPES_LAST        CHAR_z
377
201
#define UPPER_CASE(c)       (c-32)
378
379
static const short int escapes[] = {
380
    /* 0 */ 0,                       /* 1 */ 0,
381
    /* 2 */ 0,                       /* 3 */ 0,
382
    /* 4 */ 0,                       /* 5 */ 0,
383
    /* 6 */ 0,                       /* 7 */ 0,
384
    /* 8 */ 0,                       /* 9 */ 0,
385
    /* : */ ESCAPES_FIRST+0x0a,      /* ; */ ESCAPES_FIRST+0x0b,
386
    /* < */ ESCAPES_FIRST+0x0c,      /* = */ ESCAPES_FIRST+0x0d,
387
    /* > */ ESCAPES_FIRST+0x0e,      /* ? */ ESCAPES_FIRST+0x0f,
388
    /* @ */ ESCAPES_FIRST+0x10,      /* A */ -ESC_A,
389
    /* B */ -ESC_B,                  /* C */ -ESC_C,
390
    /* D */ -ESC_D,                  /* E */ -ESC_E,
391
    /* F */ 0,                       /* G */ -ESC_G,
392
    /* H */ -ESC_H,                  /* I */ 0,
393
    /* J */ 0,                       /* K */ -ESC_K,
394
    /* L */ 0,                       /* M */ 0,
395
    /* N */ -ESC_N,                  /* O */ 0,
396
    /* P */ -ESC_P,                  /* Q */ -ESC_Q,
397
    /* R */ -ESC_R,                  /* S */ -ESC_S,
398
    /* T */ 0,                       /* U */ 0,
399
    /* V */ -ESC_V,                  /* W */ -ESC_W,
400
    /* X */ -ESC_X,                  /* Y */ 0,
401
    /* Z */ -ESC_Z,                  /* [ */ ESCAPES_FIRST+0x2b,
402
    /* \ */ ESCAPES_FIRST+0x2c,      /* ] */ ESCAPES_FIRST+0x2d,
403
    /* ^ */ ESCAPES_FIRST+0x2e,      /* _ */ ESCAPES_FIRST+0x2f,
404
    /* ` */ ESCAPES_FIRST+0x30,      /* a */ CHAR_BEL,
405
    /* b */ -ESC_b,                  /* c */ 0,
406
    /* d */ -ESC_d,                  /* e */ CHAR_ESC,
407
    /* f */ CHAR_FF,                 /* g */ 0,
408
    /* h */ -ESC_h,                  /* i */ 0,
409
    /* j */ 0,                       /* k */ -ESC_k,
410
    /* l */ 0,                       /* m */ 0,
411
    /* n */ CHAR_LF,                 /* o */ 0,
412
    /* p */ -ESC_p,                  /* q */ 0,
413
    /* r */ CHAR_CR,                 /* s */ -ESC_s,
414
    /* t */ CHAR_HT,                 /* u */ 0,
415
    /* v */ -ESC_v,                  /* w */ -ESC_w,
416
    /* x */ 0,                       /* y */ 0,
417
    /* z */ -ESC_z
418
};
419
420
#else
421
422
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
423
It runs from 'a' to '9'. Our EBCDIC support can be provided via the compiler,
424
which can interpret character literals like 'a' or '[' in an EBCDIC codepage;
425
in this case, there is wide variance between codepages on the interpretation of
426
characters between the letters ('[' and '{' and so on are placed in all sorts of
427
different positions in the table). Thankfully however, all EBCDIC codepages
428
place the letters and digits in the same location, so we hardcode that here.
429
Our EBCDIC support can also be provided via numeric literals instead of
430
character literals, so either way, 'CHAR_a' will be 0x81 when PCRE2 is compiled
431
in EBCDIC mode. */
432
433
#define ESCAPES_FIRST       CHAR_a
434
#define ESCAPES_LAST        CHAR_9
435
#define UPPER_CASE(c)       (c+64)
436
437
static const short int escapes[] = {
438
    /* 0x81 a */ CHAR_BEL,             /* 0x82 b */ -ESC_b,
439
    /* 0x83 c */ 0,                    /* 0x84 d */ -ESC_d,
440
    /* 0x85 e */ CHAR_ESC,             /* 0x86 f */ CHAR_FF,
441
    /* 0x87 g */ 0,                    /* 0x88 h */ -ESC_h,
442
    /* 0x89 i */ 0,                    /* 0x8a   */ ESCAPES_FIRST+0x09,
443
    /* 0x8b   */ ESCAPES_FIRST+0x0a,   /* 0x8c   */ ESCAPES_FIRST+0x0b,
444
    /* 0x8d   */ ESCAPES_FIRST+0x0c,   /* 0x8e   */ ESCAPES_FIRST+0x0d,
445
    /* 0x8f   */ ESCAPES_FIRST+0x0e,   /* 0x90   */ ESCAPES_FIRST+0x0f,
446
    /* 0x91 j */ 0,                    /* 0x92 k */ -ESC_k,
447
    /* 0x93 l */ 0,                    /* 0x94 m */ 0,
448
    /* 0x95 n */ CHAR_LF,              /* 0x96 o */ 0,
449
    /* 0x97 p */ -ESC_p,               /* 0x98 q */ 0,
450
    /* 0x99 r */ CHAR_CR,              /* 0x9a   */ ESCAPES_FIRST+0x19,
451
    /* 0x9b   */ ESCAPES_FIRST+0x1a,   /* 0x9c   */ ESCAPES_FIRST+0x1b,
452
    /* 0x9d   */ ESCAPES_FIRST+0x1c,   /* 0x9e   */ ESCAPES_FIRST+0x1d,
453
    /* 0x9f   */ ESCAPES_FIRST+0x1e,   /* 0xa0   */ ESCAPES_FIRST+0x1f,
454
    /* 0xa1   */ ESCAPES_FIRST+0x20,   /* 0xa2 s */ -ESC_s,
455
    /* 0xa3 t */ CHAR_HT,              /* 0xa4 u */ 0,
456
    /* 0xa5 v */ -ESC_v,               /* 0xa6 w */ -ESC_w,
457
    /* 0xa7 x */ 0,                    /* 0xa8 y */ 0,
458
    /* 0xa9 z */ -ESC_z,               /* 0xaa   */ ESCAPES_FIRST+0x29,
459
    /* 0xab   */ ESCAPES_FIRST+0x2a,   /* 0xac   */ ESCAPES_FIRST+0x2b,
460
    /* 0xad   */ ESCAPES_FIRST+0x2c,   /* 0xae   */ ESCAPES_FIRST+0x2d,
461
    /* 0xaf   */ ESCAPES_FIRST+0x2e,   /* 0xb0   */ ESCAPES_FIRST+0x2f,
462
    /* 0xb1   */ ESCAPES_FIRST+0x30,   /* 0xb2   */ ESCAPES_FIRST+0x31,
463
    /* 0xb3   */ ESCAPES_FIRST+0x32,   /* 0xb4   */ ESCAPES_FIRST+0x33,
464
    /* 0xb5   */ ESCAPES_FIRST+0x34,   /* 0xb6   */ ESCAPES_FIRST+0x35,
465
    /* 0xb7   */ ESCAPES_FIRST+0x36,   /* 0xb8   */ ESCAPES_FIRST+0x37,
466
    /* 0xb9   */ ESCAPES_FIRST+0x38,   /* 0xba   */ ESCAPES_FIRST+0x39,
467
    /* 0xbb   */ ESCAPES_FIRST+0x3a,   /* 0xbc   */ ESCAPES_FIRST+0x3b,
468
    /* 0xbd   */ ESCAPES_FIRST+0x3c,   /* 0xbe   */ ESCAPES_FIRST+0x3d,
469
    /* 0xbf   */ ESCAPES_FIRST+0x3e,   /* 0xc0   */ ESCAPES_FIRST+0x3f,
470
    /* 0xc1 A */ -ESC_A,               /* 0xc2 B */ -ESC_B,
471
    /* 0xc3 C */ -ESC_C,               /* 0xc4 D */ -ESC_D,
472
    /* 0xc5 E */ -ESC_E,               /* 0xc6 F */ 0,
473
    /* 0xc7 G */ -ESC_G,               /* 0xc8 H */ -ESC_H,
474
    /* 0xc9 I */ 0,                    /* 0xca   */ ESCAPES_FIRST+0x49,
475
    /* 0xcb   */ ESCAPES_FIRST+0x4a,   /* 0xcc   */ ESCAPES_FIRST+0x4b,
476
    /* 0xcd   */ ESCAPES_FIRST+0x4c,   /* 0xce   */ ESCAPES_FIRST+0x4d,
477
    /* 0xcf   */ ESCAPES_FIRST+0x4e,   /* 0xd0   */ ESCAPES_FIRST+0x4f,
478
    /* 0xd1 J */ 0,                    /* 0xd2 K */ -ESC_K,
479
    /* 0xd3 L */ 0,                    /* 0xd4 M */ 0,
480
    /* 0xd5 N */ -ESC_N,               /* 0xd6 O */ 0,
481
    /* 0xd7 P */ -ESC_P,               /* 0xd8 Q */ -ESC_Q,
482
    /* 0xd9 R */ -ESC_R,               /* 0xda   */ ESCAPES_FIRST+0x59,
483
    /* 0xdb   */ ESCAPES_FIRST+0x5a,   /* 0xdc   */ ESCAPES_FIRST+0x5b,
484
    /* 0xdd   */ ESCAPES_FIRST+0x5c,   /* 0xde   */ ESCAPES_FIRST+0x5d,
485
    /* 0xdf   */ ESCAPES_FIRST+0x5e,   /* 0xe0   */ ESCAPES_FIRST+0x5f,
486
    /* 0xe1   */ ESCAPES_FIRST+0x60,   /* 0xe2 S */ -ESC_S,
487
    /* 0xe3 T */ 0,                    /* 0xe4 U */ 0,
488
    /* 0xe5 V */ -ESC_V,               /* 0xe6 W */ -ESC_W,
489
    /* 0xe7 X */ -ESC_X,               /* 0xe8 Y */ 0,
490
    /* 0xe9 Z */ -ESC_Z,               /* 0xea   */ ESCAPES_FIRST+0x69,
491
    /* 0xeb   */ ESCAPES_FIRST+0x6a,   /* 0xec   */ ESCAPES_FIRST+0x6b,
492
    /* 0xed   */ ESCAPES_FIRST+0x6c,   /* 0xee   */ ESCAPES_FIRST+0x6d,
493
    /* 0xef   */ ESCAPES_FIRST+0x6e,   /* 0xf0 0 */ 0,
494
    /* 0xf1 1 */ 0,                    /* 0xf2 2 */ 0,
495
    /* 0xf3 3 */ 0,                    /* 0xf4 4 */ 0,
496
    /* 0xf5 5 */ 0,                    /* 0xf6 6 */ 0,
497
    /* 0xf7 7 */ 0,                    /* 0xf8 8 */ 0,
498
    /* 0xf9 9 */ 0,
499
};
500
501
/* We also need a table of characters that may follow \c in an EBCDIC
502
environment for characters 0-31. */
503
504
static unsigned char ebcdic_escape_c[] = {
505
  CHAR_COMMERCIAL_AT, CHAR_A, CHAR_B, CHAR_C, CHAR_D, CHAR_E, CHAR_F, CHAR_G,
506
  CHAR_H, CHAR_I, CHAR_J, CHAR_K, CHAR_L, CHAR_M, CHAR_N, CHAR_O, CHAR_P,
507
  CHAR_Q, CHAR_R, CHAR_S, CHAR_T, CHAR_U, CHAR_V, CHAR_W, CHAR_X, CHAR_Y,
508
  CHAR_Z, CHAR_LEFT_SQUARE_BRACKET, CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
509
  CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE
510
};
511
512
#endif   /* EBCDIC */
513
514
515
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
516
searched linearly. Put all the names into a single string, in order to reduce
517
the number of relocations when a shared library is dynamically linked. The
518
string is built from string macros so that it works in UTF-8 mode on EBCDIC
519
platforms. */
520
521
typedef struct verbitem {
522
  unsigned int len;          /* Length of verb name */
523
  uint32_t meta;             /* Base META_ code */
524
  int has_arg;               /* > 0 required; <= 0 optional (see verbs[]) */
525
} verbitem;
526
527
static const char verbnames[] =
528
  "\0"                       /* Empty name is a shorthand for MARK */
529
  STRING_MARK0
530
  STRING_ACCEPT0
531
  STRING_F0
532
  STRING_FAIL0
533
  STRING_COMMIT0
534
  STRING_PRUNE0
535
  STRING_SKIP0
536
  STRING_THEN;
537
538
static const verbitem verbs[] = {
539
  { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
540
  { 4, META_MARK,   +1 },
541
  { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
542
  { 1, META_FAIL,   -1 },
543
  { 4, META_FAIL,   -1 },
544
  { 6, META_COMMIT,  0 },
545
  { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
546
  { 4, META_SKIP,    0 },
547
  { 4, META_THEN,    0 }
548
};
549
550
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
551
552
/* Verb opcodes, indexed by their META code offset from META_MARK. */
553
554
static const uint32_t verbops[] = {
555
  OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
556
  OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
557
558
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
559
560
typedef struct alasitem {
561
  unsigned int len;          /* Length of name */
562
  uint32_t meta;             /* Base META_ code */
563
} alasitem;
564
565
static const char alasnames[] =
566
  STRING_pla0
567
  STRING_plb0
568
  STRING_napla0
569
  STRING_naplb0
570
  STRING_nla0
571
  STRING_nlb0
572
  STRING_positive_lookahead0
573
  STRING_positive_lookbehind0
574
  STRING_non_atomic_positive_lookahead0
575
  STRING_non_atomic_positive_lookbehind0
576
  STRING_negative_lookahead0
577
  STRING_negative_lookbehind0
578
  STRING_scs0
579
  STRING_scan_substring0
580
  STRING_atomic0
581
  STRING_sr0
582
  STRING_asr0
583
  STRING_script_run0
584
  STRING_atomic_script_run;
585
586
static const alasitem alasmeta[] = {
587
  {  3, META_LOOKAHEAD         },
588
  {  3, META_LOOKBEHIND        },
589
  {  5, META_LOOKAHEAD_NA      },
590
  {  5, META_LOOKBEHIND_NA     },
591
  {  3, META_LOOKAHEADNOT      },
592
  {  3, META_LOOKBEHINDNOT     },
593
  { 18, META_LOOKAHEAD         },
594
  { 19, META_LOOKBEHIND        },
595
  { 29, META_LOOKAHEAD_NA      },
596
  { 30, META_LOOKBEHIND_NA     },
597
  { 18, META_LOOKAHEADNOT      },
598
  { 19, META_LOOKBEHINDNOT     },
599
  {  3, META_SCS               },
600
  { 14, META_SCS               },
601
  {  6, META_ATOMIC            },
602
  {  2, META_SCRIPT_RUN        }, /* sr = script run */
603
  {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
604
  { 10, META_SCRIPT_RUN        }, /* script run */
605
  { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
606
};
607
608
static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
609
610
/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
611
612
static uint32_t chartypeoffset[] = {
613
  OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
614
  OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
615
616
/* Tables of names of POSIX character classes and their lengths. The names are
617
now all in a single string, to reduce the number of relocations when a shared
618
library is dynamically loaded. The list of lengths is terminated by a zero
619
length entry. The first three must be alpha, lower, upper, as this is assumed
620
for handling case independence.
621
622
The indices for several classes are stored in pcre2_compile.h - these must
623
be kept in sync with posix_names, posix_name_lengths, posix_class_maps,
624
and posix_substitutes. */
625
626
static const char posix_names[] =
627
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
628
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
629
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
630
  STRING_word0  STRING_xdigit;
631
632
static const uint8_t posix_name_lengths[] = {
633
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
634
635
/* Table of class bit maps for each POSIX class. Each class is formed from a
636
base map, with an optional addition or removal of another map. Then, for some
637
classes, there is some additional tweaking: for [:blank:] the vertical space
638
characters are removed, and for [:alpha:] and [:alnum:] the underscore
639
character is removed. The triples in the table consist of the base map offset,
640
second map offset or -1 if no second map, and a non-negative value for map
641
addition or a negative value for map subtraction (if there are two maps). The
642
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
643
remove vertical space characters, 2 => remove underscore. */
644
645
const int PRIV(posix_class_maps)[] = {
646
  cbit_word,   cbit_digit, -2,            /* alpha */
647
  cbit_lower,  -1,          0,            /* lower */
648
  cbit_upper,  -1,          0,            /* upper */
649
  cbit_word,   -1,          2,            /* alnum - word without underscore */
650
  cbit_print,  cbit_cntrl,  0,            /* ascii */
651
  cbit_space,  -1,          1,            /* blank - a GNU extension */
652
  cbit_cntrl,  -1,          0,            /* cntrl */
653
  cbit_digit,  -1,          0,            /* digit */
654
  cbit_graph,  -1,          0,            /* graph */
655
  cbit_print,  -1,          0,            /* print */
656
  cbit_punct,  -1,          0,            /* punct */
657
  cbit_space,  -1,          0,            /* space */
658
  cbit_word,   -1,          0,            /* word - a Perl extension */
659
  cbit_xdigit, -1,          0             /* xdigit */
660
};
661
662
#ifdef SUPPORT_UNICODE
663
664
/* The POSIX class Unicode property substitutes that are used in UCP mode must
665
be in the order of the POSIX class names, defined above. */
666
667
static int posix_substitutes[] = {
668
  PT_GC, ucp_L,     /* alpha */
669
  PT_PC, ucp_Ll,    /* lower */
670
  PT_PC, ucp_Lu,    /* upper */
671
  PT_ALNUM, 0,      /* alnum */
672
  -1, 0,            /* ascii, treat as non-UCP */
673
  -1, 1,            /* blank, treat as \h */
674
  PT_PC, ucp_Cc,    /* cntrl */
675
  PT_PC, ucp_Nd,    /* digit */
676
  PT_PXGRAPH, 0,    /* graph */
677
  PT_PXPRINT, 0,    /* print */
678
  PT_PXPUNCT, 0,    /* punct */
679
  PT_PXSPACE, 0,    /* space */   /* Xps is POSIX space, but from 8.34 */
680
  PT_WORD, 0,       /* word  */   /* Perl and POSIX space are the same */
681
  PT_PXXDIGIT, 0    /* xdigit */  /* Perl has additional hex digits */
682
};
683
#endif  /* SUPPORT_UNICODE */
684
685
/* Masks for checking option settings. When PCRE2_LITERAL is set, only a subset
686
are allowed. */
687
688
/* Option bits that remain valid when PCRE2_LITERAL is set. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED          | \
   PCRE2_AUTO_CALLOUT      | \
   PCRE2_CASELESS          | \
   PCRE2_ENDANCHORED       | \
   PCRE2_FIRSTLINE         | \
   PCRE2_LITERAL           | \
   PCRE2_MATCH_INVALID_UTF | \
   PCRE2_NO_START_OPTIMIZE | \
   PCRE2_NO_UTF_CHECK      | \
   PCRE2_USE_OFFSET_LIMIT  | \
   PCRE2_UTF)

/* All option bits: the literal subset plus the remaining options. */

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS | \
   PCRE2_ALLOW_EMPTY_CLASS    | \
   PCRE2_ALT_BSUX             | \
   PCRE2_ALT_CIRCUMFLEX       | \
   PCRE2_ALT_VERBNAMES        | \
   PCRE2_DOLLAR_ENDONLY       | \
   PCRE2_DOTALL               | \
   PCRE2_DUPNAMES             | \
   PCRE2_EXTENDED             | \
   PCRE2_EXTENDED_MORE        | \
   PCRE2_MATCH_UNSET_BACKREF  | \
   PCRE2_MULTILINE            | \
   PCRE2_NEVER_BACKSLASH_C    | \
   PCRE2_NEVER_UCP            | \
   PCRE2_NEVER_UTF            | \
   PCRE2_NO_AUTO_CAPTURE      | \
   PCRE2_NO_AUTO_POSSESS      | \
   PCRE2_NO_DOTSTAR_ANCHOR    | \
   PCRE2_UCP                  | \
   PCRE2_UNGREEDY             | \
   PCRE2_ALT_EXTENDED_CLASS)

/* Extra-option bits that remain valid when PCRE2_LITERAL is set. */

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
   (PCRE2_EXTRA_MATCH_LINE        | \
    PCRE2_EXTRA_MATCH_WORD        | \
    PCRE2_EXTRA_CASELESS_RESTRICT | \
    PCRE2_EXTRA_TURKISH_CASING)

/* All extra-option bits: the literal subset plus the remaining ones. */

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
   (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS | \
    PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES | \
    PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL   | \
    PCRE2_EXTRA_ESCAPED_CR_IS_LF        | \
    PCRE2_EXTRA_ALT_BSUX                | \
    PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK    | \
    PCRE2_EXTRA_ASCII_BSD               | \
    PCRE2_EXTRA_ASCII_BSS               | \
    PCRE2_EXTRA_ASCII_BSW               | \
    PCRE2_EXTRA_ASCII_POSIX             | \
    PCRE2_EXTRA_ASCII_DIGIT             | \
    PCRE2_EXTRA_PYTHON_OCTAL            | \
    PCRE2_EXTRA_NO_BS0                  | \
    PCRE2_EXTRA_NEVER_CALLOUT)
714
715
/* This is a table of start-of-pattern options such as (*UTF) and settings such
716
as (*LIMIT_MATCH=nnnn) and (*CRLF). For completeness and backward
717
compatibility, (*UTFn) is supported in the relevant libraries, but (*UTF) is
718
generic and always supported. */
719
720
/* Kinds of start-of-pattern item; selects how a table entry's value is used. */

enum {
  PSO_OPT   = 0,   /* Value is an option bit */
  PSO_XOPT  = 1,   /* Value is an xoption bit */
  PSO_FLG   = 2,   /* Value is a flag bit */
  PSO_NL    = 3,   /* Value is a newline type */
  PSO_BSR   = 4,   /* Value is a \R type */
  PSO_LIMH  = 5,   /* Read integer value for heap limit */
  PSO_LIMM  = 6,   /* Read integer value for match limit */
  PSO_LIMD  = 7,   /* Read integer value for depth limit */
  PSO_OPTMZ = 8    /* Value is an optimization bit */
};
730
731
/* One entry in the table of start-of-pattern items. */

typedef struct pso {
  const char *name;    /* Text of the item */
  uint16_t    length;  /* Length associated with the name */
  uint16_t    type;    /* One of the PSO_xxx kinds */
  uint32_t    value;   /* Bit or setting, interpreted according to type */
} pso;
737
738
/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
739
740
static const pso pso_list[] = {
741
  { STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
742
  { STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
743
  { STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
744
  { STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
745
  { STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
746
  { STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },
747
  { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },
748
  { STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
749
  { STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },
750
  { STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT },
751
  { STRING_TURKISH_CASING_RIGHTPAR,    15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING },
752
  { STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
753
  { STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
754
  { STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
755
  { STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
756
  { STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
757
  { STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
758
  { STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
759
  { STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
760
  { STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
761
  { STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
762
  { STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
763
  { STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
764
};
765
766
/* This table is used when converting repeating opcodes into possessified
767
versions as a result of an explicit possessive quantifier such as ++. A zero
768
value means there is no possessified version - in those cases the item in
769
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
770
because all relevant opcodes are less than that. */
771
772
static const uint8_t opcode_possessify[] = {
773
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
774
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
775
776
  0,                       /* NOTI */
777
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
778
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
779
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
780
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
781
  0,                       /* EXACT */
782
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
783
784
  OP_POSSTARI, 0,          /* STARI, MINSTARI */
785
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
786
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
787
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
788
  0,                       /* EXACTI */
789
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
790
791
  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
792
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
793
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
794
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
795
  0,                       /* NOTEXACT */
796
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
797
798
  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
799
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
800
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
801
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
802
  0,                       /* NOTEXACTI */
803
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
804
805
  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
806
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
807
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
808
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
809
  0,                       /* TYPEEXACT */
810
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
811
812
  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
813
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
814
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
815
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
816
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
817
818
  0, 0, 0, 0,              /* CLASS, NCLASS, XCLASS, ECLASS */
819
  0, 0,                    /* REF, REFI */
820
  0, 0,                    /* DNREF, DNREFI */
821
  0, 0,                    /* RECURSE, CALLOUT */
822
};
823
824
/* Compile-time check that the table has the correct size. */
825
STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify);
826
827
828
#ifdef DEBUG_SHOW_PARSED
829
/*************************************************
830
*     Show the parsed pattern for debugging      *
831
*************************************************/
832
833
/* For debugging the pre-scan, this code, which outputs the parsed data vector,
834
can be enabled. */
835
836
/* Debugging aid: write a readable dump of the parsed pattern vector to stderr,
one item per line, stopping at META_END or at the first unrecognised META code.

The conversion specifiers are chosen to match the argument types: uint32_t
values are printed with %u/%x (this file already assumes uint32_t is unsigned
int by using %x on them), PCRE2_SIZE offsets with %zu (PCRE2_SIZE is an
unsigned size type), and characters are cast to int for %c.

Argument:
  cb          the compile block whose parsed_pattern vector is to be shown

Returns:      nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  /* Index of the item within the vector, followed by its raw value. */

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are literal characters; show printable ASCII. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* For group numbers below 10 the offset is taken from the compile block
    instead of from the parsed vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      {
      GETOFFSET(offset, pptr);
      }
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* The number is stored after the offset, so it is printed first by
    indexing past the offset words, and skipped once the offset is read. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_OFFSET:
    fprintf(stderr, "META_OFFSET offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_SCS:
    fprintf(stderr, "META (*scan_substring:");
    break;

    case META_CAPTURE_NAME:
    fprintf(stderr, "META_CAPTURE_NAME length=%u relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    case META_CAPTURE_NUMBER:
    fprintf(stderr, "META_CAPTURE_NUMBER %u relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    /* The verbs that take an argument share the code that prints it. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;

    case META_ECLASS_AND: fprintf(stderr, "META_ECLASS_AND"); break;
    case META_ECLASS_OR: fprintf(stderr, "META_ECLASS_OR"); break;
    case META_ECLASS_SUB: fprintf(stderr, "META_ECLASS_SUB"); break;
    case META_ECLASS_XOR: fprintf(stderr, "META_ECLASS_XOR"); break;
    case META_ECLASS_NOT: fprintf(stderr, "META_ECLASS_NOT"); break;
    }
  fprintf(stderr, "\n");
  }
}
1120
#endif  /* DEBUG_SHOW_PARSED */
1121
1122
1123
1124
/*************************************************
1125
*               Copy compiled code               *
1126
*************************************************/
1127
1128
/* Compiled JIT code cannot be copied, so the new compiled block has no
1129
associated JIT data. */
1130
1131
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy(const pcre2_code *code)
{
pcre2_code *duplicate;

if (code == NULL) return NULL;

/* The copy is obtained from the original's own allocator. */

duplicate = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (duplicate != NULL)
  {
  memcpy(duplicate, code, code->blocksize);
  duplicate->executable_jit = NULL;    /* JIT data is not carried over */

  /* When the tables are reference-counted, the copy shares them with the
  original, so there is now one more user. The count is stored immediately
  after the tables. */

  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
    {
    PCRE2_SIZE *users = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
    *users += 1;
    }
  }

return duplicate;
}
1154
1155
1156
1157
/*************************************************
1158
*     Copy compiled code and character tables    *
1159
*************************************************/
1160
1161
/* Compiled JIT code cannot be copied, so the new compiled block has no
1162
associated JIT data. This version of code_copy also makes a separate copy of
1163
the character tables. */
1164
1165
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
pcre2_code_copy_with_tables(const pcre2_code *code)
{
pcre2_code *duplicate;
uint8_t *tables_copy;

if (code == NULL) return NULL;

/* First duplicate the compiled block itself, without any JIT data. */

duplicate = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
if (duplicate == NULL) return NULL;
memcpy(duplicate, code, code->blocksize);
duplicate->executable_jit = NULL;

/* Then make a private copy of the tables, with room for a reference count
after them. If that allocation fails, release the block just made. */

tables_copy = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
  code->memctl.memory_data);
if (tables_copy == NULL)
  {
  code->memctl.free((void *)duplicate, code->memctl.memory_data);
  return NULL;
  }
memcpy(tables_copy, code->tables, TABLES_LENGTH);

/* The new tables have exactly one user, and the copy is flagged so that its
tables are reference-counted. */

*((PCRE2_SIZE *)(tables_copy + TABLES_LENGTH)) = 1;
duplicate->tables = tables_copy;
duplicate->flags |= PCRE2_DEREF_TABLES;

return duplicate;
}
1193
1194
1195
1196
/*************************************************
1197
*               Free compiled code               *
1198
*************************************************/
1199
1200
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1201
pcre2_code_free(pcre2_code *code)
1202
82.9k
{
1203
82.9k
PCRE2_SIZE* ref_count;
1204
1205
82.9k
if (code != NULL)
1206
72.9k
  {
1207
72.9k
#ifdef SUPPORT_JIT
1208
72.9k
  if (code->executable_jit != NULL)
1209
67.4k
    PRIV(jit_free)(code->executable_jit, &code->memctl);
1210
72.9k
#endif
1211
1212
72.9k
  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1213
0
    {
1214
    /* Decoded tables belong to the codes after deserialization, and they must
1215
    be freed when there are no more references to them. The *ref_count should
1216
    always be > 0. */
1217
1218
0
    ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1219
0
    if (*ref_count > 0)
1220
0
      {
1221
0
      (*ref_count)--;
1222
0
      if (*ref_count == 0)
1223
0
        code->memctl.free((void *)code->tables, code->memctl.memory_data);
1224
0
      }
1225
0
    }
1226
1227
72.9k
  code->memctl.free(code, code->memctl.memory_data);
1228
72.9k
  }
1229
82.9k
}
1230
1231
1232
1233
/*************************************************
1234
*         Read a number, possibly signed         *
1235
*************************************************/
1236
1237
/* This function is used to read numbers in the pattern. The initial pointer
1238
must be at the sign or first digit of the number. When relative values
1239
(introduced by + or -) are allowed, they are relative group numbers, and the
1240
result must be greater than zero.
1241
1242
Arguments:
1243
  ptrptr      points to the character pointer variable
1244
  ptrend      points to the end of the input string
1245
  allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1246
  max_value   the largest number allowed;
1247
              you must not pass a value for max_value larger than
1248
              INT_MAX/10 - 1 because this function relies on max_value to
1249
              avoid integer overflow
1250
  max_error   the error to give for an over-large number
1251
  intptr      where to put the result
1252
  errorcodeptr  where to put an error code
1253
1254
Returns:      TRUE  - a number was read
1255
              FALSE - errorcode == 0 => no number was found
1256
                      errorcode != 0 => an error occurred
1257
*/
1258
1259
static BOOL
1260
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1261
  uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1262
202k
{
1263
202k
int sign = 0;
1264
202k
uint32_t n = 0;
1265
202k
PCRE2_SPTR ptr = *ptrptr;
1266
202k
BOOL yield = FALSE;
1267
1268
202k
PCRE2_ASSERT(max_value <= INT_MAX/10 - 1);
1269
1270
202k
*errorcodeptr = 0;
1271
1272
202k
if (allow_sign >= 0 && ptr < ptrend)
1273
21.2k
  {
1274
21.2k
  if (*ptr == CHAR_PLUS)
1275
451
    {
1276
451
    sign = +1;
1277
451
    max_value -= allow_sign;
1278
451
    ptr++;
1279
451
    }
1280
20.7k
  else if (*ptr == CHAR_MINUS)
1281
9.40k
    {
1282
9.40k
    sign = -1;
1283
9.40k
    ptr++;
1284
9.40k
    }
1285
21.2k
  }
1286
1287
202k
if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1288
427k
while (ptr < ptrend && IS_DIGIT(*ptr))
1289
255k
  {
1290
255k
  n = n * 10 + (*ptr++ - CHAR_0);
1291
255k
  if (n > max_value)
1292
500
    {
1293
500
    *errorcodeptr = max_error;
1294
1.84k
    while (ptr < ptrend && IS_DIGIT(*ptr)) ptr++;
1295
500
    goto EXIT;
1296
500
    }
1297
255k
  }
1298
1299
171k
if (allow_sign >= 0 && sign != 0)
1300
9.85k
  {
1301
9.85k
  if (n == 0)
1302
12
    {
1303
12
    *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1304
12
    goto EXIT;
1305
12
    }
1306
1307
9.83k
  if (sign > 0) n += allow_sign;
1308
9.39k
  else if (n > (uint32_t)allow_sign)
1309
24
    {
1310
24
    *errorcodeptr = ERR15;  /* Non-existent subpattern */
1311
24
    goto EXIT;
1312
24
    }
1313
9.37k
  else n = allow_sign + 1 - n;
1314
9.83k
  }
1315
1316
171k
yield = TRUE;
1317
1318
172k
EXIT:
1319
172k
*intptr = n;
1320
172k
*ptrptr = ptr;
1321
172k
return yield;
1322
171k
}
1323
1324
1325
1326
/*************************************************
1327
*         Read repeat counts                     *
1328
*************************************************/
1329
1330
/* Read an item of the form {n,m} and return the values when non-NULL pointers
1331
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1332
larger value is used for "unlimited". We have to use signed arguments for
1333
read_number() because it is capable of returning a signed value. As of Perl
1334
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1335
tabs after { and before } and between the numbers and the comma, so we do too.
1336
1337
Arguments:
1338
  ptrptr         points to pointer to character after '{'
1339
  ptrend         pointer to end of input
1340
  minp           if not NULL, pointer to uint32_t for min
1341
  maxp           if not NULL, pointer to uint32_t for max
1342
  errorcodeptr   points to error code variable
1343
1344
Returns:         FALSE if not a repeat quantifier, errorcode set zero
1345
                 FALSE on error, with errorcode set non-zero
1346
                 TRUE on success, with pointer updated to point after '}'
1347
*/
1348
1349
static BOOL
1350
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1351
  uint32_t *maxp, int *errorcodeptr)
1352
161k
{
1353
161k
PCRE2_SPTR p = *ptrptr;
1354
161k
PCRE2_SPTR pp;
1355
161k
BOOL yield = FALSE;
1356
161k
BOOL had_minimum = FALSE;
1357
161k
int32_t min = 0;
1358
161k
int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1359
1360
161k
*errorcodeptr = 0;
1361
165k
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1362
1363
/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1364
such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1365
error. */
1366
1367
161k
pp = p;
1368
161k
if (pp < ptrend && IS_DIGIT(*pp))
1369
115k
  {
1370
115k
  had_minimum = TRUE;
1371
182k
  while (++pp < ptrend && IS_DIGIT(*pp)) {}
1372
115k
  }
1373
1374
170k
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1375
161k
if (pp >= ptrend) return FALSE;
1376
1377
161k
if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1378
57.4k
  {
1379
57.4k
  if (!had_minimum) return FALSE;
1380
57.4k
  }
1381
103k
else
1382
103k
  {
1383
103k
  if (*pp++ != CHAR_COMMA) return FALSE;
1384
62.3k
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1385
55.7k
  if (pp >= ptrend) return FALSE;
1386
55.7k
  if (IS_DIGIT(*pp))
1387
42.7k
    {
1388
54.7k
    while (++pp < ptrend && IS_DIGIT(*pp)) {}
1389
42.7k
    }
1390
12.9k
  else if (!had_minimum) return FALSE;
1391
57.7k
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1392
53.7k
  if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1393
53.7k
  }
1394
1395
/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}
1396
or {n,m}. The only error that read_number() can return is for a number that is
1397
too big. If *errorcodeptr is returned as zero it means no number was found. */
1398
1399
/* Deal with {,m} or n too big. If we successfully read m there is no need to
1400
check m >= n because n defaults to zero. */
1401
1402
101k
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1403
14.5k
  {
1404
14.5k
  if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1405
14.4k
  p++;  /* Skip comma and subsequent spaces */
1406
15.9k
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1407
14.4k
  if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1408
15
    {
1409
15
    if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1410
15
    }
1411
14.4k
  }
1412
1413
/* Have read one number. Deal with {n} or {n,} or {n,m} */
1414
1415
87.2k
else
1416
87.2k
  {
1417
92.1k
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1418
87.2k
  if (*p == CHAR_RIGHT_CURLY_BRACKET)
1419
57.0k
    {
1420
57.0k
    max = min;
1421
57.0k
    }
1422
30.1k
  else   /* Handle {n,} or {n,m} */
1423
30.1k
    {
1424
30.1k
    p++;    /* Skip comma and subsequent spaces */
1425
34.3k
    while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1426
30.1k
    if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1427
6.89k
      {
1428
6.89k
      if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1429
6.89k
      }
1430
1431
30.1k
    if (max < min)
1432
18
      {
1433
18
      *errorcodeptr = ERR4;
1434
18
      goto EXIT;
1435
18
      }
1436
30.1k
    }
1437
87.2k
  }
1438
1439
/* Valid quantifier exists */
1440
1441
103k
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1442
101k
p++;
1443
101k
yield = TRUE;
1444
101k
if (minp != NULL) *minp = (uint32_t)min;
1445
101k
if (maxp != NULL) *maxp = (uint32_t)max;
1446
1447
/* Update the pattern pointer */
1448
1449
101k
EXIT:
1450
101k
*ptrptr = p;
1451
101k
return yield;
1452
101k
}
1453
1454
1455
1456
/*************************************************
1457
*            Handle escapes                      *
1458
*************************************************/
1459
1460
/* This function is called when a \ has been encountered. It either returns a
1461
positive value for a simple escape such as \d, or 0 for a data character, which
1462
is placed in chptr. A backreference to group n is returned as -(n+1). On
1463
entry, ptr is pointing at the character after \. On exit, it points after the
1464
final code unit of the escape sequence.
1465
1466
This function is also called from pcre2_substitute() to handle escape sequences
1467
in replacement strings. In this case, the cb argument is NULL, and in the case
1468
of escapes that have further processing, only sequences that define a data
1469
character are recognised. The options argument is the final value of the
1470
compiled pattern's options.
1471
1472
Arguments:
1473
  ptrptr         points to the input position pointer
1474
  ptrend         points to the end of the input
1475
  chptr          points to a returned data character
1476
  errorcodeptr   points to the errorcode variable (containing zero)
1477
  options        the current options bits
1478
  xoptions       the current extra options bits
1479
  bracount       the number of capturing parentheses encountered so far
1480
  isclass        TRUE if in a character class
1481
  cb             compile data block or NULL when called from pcre2_substitute()
1482
1483
Returns:         zero => a data character
1484
                 positive => a special escape sequence
1485
                 negative => a numerical back reference
1486
                 on error, errorcodeptr is set non-zero
1487
*/
1488
1489
int
1490
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1491
  int *errorcodeptr, uint32_t options, uint32_t xoptions, uint32_t bracount,
1492
  BOOL isclass, compile_block *cb)
1493
281k
{
1494
281k
BOOL utf = (options & PCRE2_UTF) != 0;
1495
281k
BOOL alt_bsux =
1496
281k
  ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1497
281k
PCRE2_SPTR ptr = *ptrptr;
1498
281k
uint32_t c, cc;
1499
281k
int escape = 0;
1500
281k
int i;
1501
1502
/* If backslash is at the end of the string, it's an error. */
1503
1504
281k
if (ptr >= ptrend)
1505
58
  {
1506
58
  *errorcodeptr = ERR1;
1507
58
  return 0;
1508
58
  }
1509
1510
281k
GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1511
281k
*errorcodeptr = 0;              /* Be optimistic */
1512
1513
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1514
value test saves a memory lookup for code points outside the alphanumeric
1515
range. */
1516
1517
281k
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1518
1519
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1520
positive value is a literal value for something like \n. A negative value is
1521
the negation of one of the ESC_ macros that is passed back for handling by the
1522
calling function. Some extra checking is needed for \N because only \N{U+dddd}
1523
is supported. If the value is zero, further processing is handled below. */
1524
1525
217k
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1526
190k
  {
1527
190k
  if (i > 0)
1528
20.9k
    {
1529
20.9k
    c = (uint32_t)i;
1530
20.9k
    if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1531
0
      c = CHAR_LF;
1532
20.9k
    }
1533
169k
  else  /* Negative table entry */
1534
169k
    {
1535
169k
    escape = -i;                    /* Else return a special escape */
1536
169k
    if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1537
16.3k
      cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1538
1539
    /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1540
    Unicode code points, as well as plain \N for "not newline". PCRE does not
1541
    support \N{name}. However, it does support quantification such as \N{2,3},
1542
    so if \N{ is not followed by U+dddd we check for a quantifier. */
1543
1544
169k
    if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1545
495
      {
1546
495
      PCRE2_SPTR p = ptr + 1;
1547
1548
      /* Perl ignores spaces and tabs after { */
1549
1550
879
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1551
1552
      /* \N{U+ can be handled by the \x{ code. However, this construction is
1553
      not valid in EBCDIC environments because it specifies a Unicode
1554
      character, not a codepoint in the local code. For example \N{U+0041}
1555
      must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1556
      casing semantics for the entire pattern, so allow it only in UTF (i.e.
1557
      Unicode) mode. */
1558
1559
495
      if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1560
145
        {
1561
145
#ifndef EBCDIC
1562
145
        if (utf)
1563
83
          {
1564
83
          ptr = p + 2;
1565
83
          escape = 0;   /* Not a fancy escape after all */
1566
83
          goto COME_FROM_NU;
1567
83
          }
1568
62
#endif
1569
1570
        /* Improve error offset. */
1571
62
        ptr = p + 2;
1572
153
        while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1573
256
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1574
62
        if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET) ptr++;
1575
1576
62
        *errorcodeptr = ERR93;
1577
62
        }
1578
1579
      /* Give an error in contexts where quantifiers are not allowed
1580
      (character classes; substitution strings). */
1581
1582
350
      else if (isclass || cb == NULL)
1583
5
        {
1584
5
        ptr++; /* Skip over the opening brace */
1585
5
        *errorcodeptr = ERR37;
1586
5
        }
1587
1588
      /* Give an error if what follows is not a quantifier, but don't override
1589
      an error set by the quantifier reader (e.g. number overflow). */
1590
1591
345
      else
1592
345
        {
1593
345
        if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1594
65
             *errorcodeptr == 0)
1595
52
          {
1596
52
          ptr++; /* Skip over the opening brace */
1597
52
          *errorcodeptr = ERR37;
1598
52
          }
1599
345
        }
1600
495
      }
1601
169k
    }
1602
190k
  }
1603
1604
/* Escapes that need further processing, including those that are unknown, have
1605
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1606
\o, and \x are recognized (\u and \U can never appear as they are used for case
1607
forcing). */
1608
1609
26.3k
else
1610
26.3k
  {
1611
26.3k
  int s;
1612
26.3k
  PCRE2_SPTR oldptr;
1613
26.3k
  BOOL overflow;
1614
1615
  /* Filter calls from pcre2_substitute(). */
1616
1617
26.3k
  if (cb == NULL)
1618
0
    {
1619
0
    if (!(c >= CHAR_0 && c <= CHAR_9) && c != CHAR_c && c != CHAR_o &&
1620
0
        c != CHAR_x && c != CHAR_g)
1621
0
      {
1622
0
      *errorcodeptr = ERR3;
1623
0
      goto EXIT;
1624
0
      }
1625
0
    alt_bsux = FALSE;   /* Do not modify \x handling */
1626
0
    }
1627
1628
26.3k
  switch (c)
1629
26.3k
    {
1630
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
1631
    error. */
1632
1633
3
    case CHAR_F:
1634
6
    case CHAR_l:
1635
9
    case CHAR_L:
1636
9
    *errorcodeptr = ERR37;
1637
9
    break;
1638
1639
    /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1640
    is set. Otherwise, \u must be followed by exactly four hex digits or, if
1641
    PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1642
    Otherwise it is a lowercase u letter. This gives some compatibility with
1643
    ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1644
    allowed. When \u{ is not followed by hex digits, a special return is given
1645
    because otherwise \u{ 12} (for example) would be treated as u{12}. */
1646
1647
3.35k
    case CHAR_u:
1648
3.35k
    if (!alt_bsux)
1649
133
      *errorcodeptr = ERR37;
1650
3.21k
    else
1651
3.21k
      {
1652
3.21k
      uint32_t xc;
1653
1654
3.21k
      if (ptr >= ptrend) break;
1655
3.21k
      if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1656
203
          (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1657
0
        {
1658
0
        PCRE2_SPTR hptr = ptr + 1;
1659
1660
0
        cc = 0;
1661
0
        while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1662
0
          {
1663
0
          if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1664
0
            {
1665
0
            *errorcodeptr = ERR77;
1666
0
            ptr = hptr;   /* Show where */
1667
0
            break;        /* *hptr != } will cause another break below */
1668
0
            }
1669
0
          cc = (cc << 4) | xc;
1670
0
          hptr++;
1671
0
          }
1672
1673
0
        if (hptr == ptr + 1 ||   /* No hex digits */
1674
0
            hptr >= ptrend ||    /* Hit end of input */
1675
0
            *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1676
0
          {
1677
0
          if (isclass) break; /* In a class, just treat as '\u' literal */
1678
0
          escape = ESC_ub;    /* Special return */
1679
0
          ptr++;              /* Skip { */
1680
0
          break;              /* Hex escape not recognized */
1681
0
          }
1682
1683
0
        c = cc;          /* Accept the code point */
1684
0
        ptr = hptr + 1;
1685
0
        }
1686
1687
3.21k
      else  /* Must be exactly 4 hex digits */
1688
3.21k
        {
1689
3.21k
        if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1690
3.18k
        if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1691
2.32k
        if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1692
1.91k
        cc = (cc << 4) | xc;
1693
1.91k
        if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1694
1.47k
        cc = (cc << 4) | xc;
1695
1.47k
        if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1696
741
        c = (cc << 4) | xc;
1697
741
        ptr += 4;
1698
741
        }
1699
1700
741
      if (utf)
1701
393
        {
1702
393
        if (c > 0x10ffffU) *errorcodeptr = ERR77;
1703
393
        else
1704
393
          if (c >= 0xd800 && c <= 0xdfff &&
1705
1
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1706
1
                *errorcodeptr = ERR73;
1707
393
        }
1708
348
      else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1709
741
      }
1710
874
    break;
1711
1712
    /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1713
    in which case it is an upper case letter. */
1714
1715
874
    case CHAR_U:
1716
237
    if (!alt_bsux) *errorcodeptr = ERR37;
1717
237
    break;
1718
1719
    /* In a character class, \g is just a literal "g". Outside a character
1720
    class, \g must be followed by one of a number of specific things:
1721
1722
    (1) A number, either plain or braced. If positive, it is an absolute
1723
    backreference. If negative, it is a relative backreference. This is a Perl
1724
    5.10 feature.
1725
1726
    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1727
    is part of Perl's movement towards a unified syntax for back references. As
1728
    this is synonymous with \k{name}, we fudge it up by pretending it really
1729
    was \k{name}.
1730
1731
    (3) For Oniguruma compatibility we also support \g followed by a name or a
1732
    number either in angle brackets or in single quotes. However, these are
1733
    (possibly recursive) subroutine calls, _not_ backreferences. We return
1734
    the ESC_g code.
1735
1736
    Summary: Return a negative number for a numerical back reference (offset
1737
    by 1), ESC_k for a named back reference, and ESC_g for a named or
1738
    numbered subroutine call.
1739
1740
    The above describes the \g behaviour inside patterns. Inside replacement
1741
    strings (pcre2_substitute) we support only \g<nameornum> for Python
1742
    compatibility. Return ESG_g for the named case, and -(num+1) for the
1743
    numbered case.
1744
    */
1745
1746
2.26k
    case CHAR_g:
1747
2.26k
    if (isclass) break;
1748
1749
2.04k
    if (ptr >= ptrend)
1750
3
      {
1751
3
      *errorcodeptr = ERR57;
1752
3
      break;
1753
3
      }
1754
1755
2.04k
    if (cb == NULL)
1756
0
      {
1757
0
      PCRE2_SPTR p;
1758
      /* Substitution strings */
1759
0
      if (*ptr != CHAR_LESS_THAN_SIGN)
1760
0
        {
1761
0
        *errorcodeptr = ERR57;
1762
0
        break;
1763
0
        }
1764
1765
0
      p = ptr + 1;
1766
1767
0
      if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,
1768
0
          errorcodeptr))
1769
0
        {
1770
0
        if (*errorcodeptr == 0) escape = ESC_g;  /* No number found */
1771
0
        break;
1772
0
        }
1773
1774
0
      if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)
1775
0
        {
1776
0
        ptr = p;
1777
0
        *errorcodeptr = ERR119;  /* Missing terminator for number */
1778
0
        break;
1779
0
        }
1780
1781
      /* This is the reason that back references are returned as -(s+1) rather
1782
      than just -s. In a pattern, \0 is not a back reference, but \g<0> is
1783
      valid in a substitution string, so this must be representable. */
1784
0
      ptr = p + 1;
1785
0
      escape = -(s+1);
1786
0
      break;
1787
0
      }
1788
1789
2.04k
    if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1790
671
      {
1791
671
      escape = ESC_g;
1792
671
      break;
1793
671
      }
1794
1795
    /* If there is a brace delimiter, try to read a numerical reference. If
1796
    there isn't one, assume we have a name and treat it as \k. */
1797
1798
1.37k
    if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1799
1.07k
      {
1800
1.07k
      PCRE2_SPTR p = ptr + 1;
1801
1802
1.62k
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1803
1.07k
      if (!read_number(&p, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1804
1.07k
          errorcodeptr))
1805
751
        {
1806
751
        if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1807
751
        break;
1808
751
        }
1809
1.33k
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1810
1811
328
      if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1812
41
        {
1813
41
        ptr = p;
1814
41
        *errorcodeptr = ERR119;  /* Missing terminator for number */
1815
41
        break;
1816
41
        }
1817
287
      ptr = p + 1;
1818
287
      }
1819
1820
    /* Read an undelimited number */
1821
1822
293
    else
1823
293
      {
1824
293
      if (!read_number(&ptr, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1825
293
          errorcodeptr))
1826
34
        {
1827
34
        if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1828
34
        break;
1829
34
        }
1830
293
      }
1831
1832
546
    if (s <= 0)
1833
3
      {
1834
3
      *errorcodeptr = ERR15;
1835
3
      break;
1836
3
      }
1837
1838
543
    escape = -(s+1);
1839
543
    break;
1840
1841
    /* The handling of escape sequences consisting of a string of digits
1842
    starting with one that is not zero is not straightforward. Perl has changed
1843
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
1844
    recommended to avoid the ambiguities in the old syntax.
1845
1846
    Outside a character class, the digits are read as a decimal number. If the
1847
    number is less than 10, or if there are that many previous extracting left
1848
    brackets, it is a back reference. Otherwise, up to three octal digits are
1849
    read to form an escaped character code. Thus \123 is likely to be octal 123
1850
    (cf \0123, which is octal 012 followed by the literal 3). This is the "Perl
1851
    style" of handling ambiguous octal/backrefences such as \12.
1852
1853
    There is an alternative disambiguation strategy, selected by
1854
    PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must
1855
    have either a leading zero, or exactly three octal digits; otherwise it's
1856
    a backreference. The disambiguation is stable, and does not depend on how
1857
    many capture groups are defined (it's simply an invalid backreference if
1858
    there is no corresponding capture group). Additionally, octal values above
1859
    \377 (\xff) are rejected.
1860
1861
    Inside a character class, \ followed by a digit is always either a literal
1862
    8 or 9 or an octal number. */
1863
1864
11.8k
    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1865
14.1k
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1866
1867
14.1k
    if (isclass)
1868
764
      {
1869
      /* Fall through to octal handling; never a backreference inside a class. */
1870
764
      }
1871
13.3k
    else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)
1872
0
      {
1873
      /* Python-style disambiguation. */
1874
0
      if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&
1875
0
          ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1876
0
        {
1877
        /* We peeked a three-digit octal, so fall through */
1878
0
        }
1879
0
      else
1880
0
        {
1881
        /* We are at a digit, so the only possible error from read_number() is
1882
        a number that is too large. */
1883
0
        ptr--;   /* Back to the digit */
1884
1885
0
        if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1886
0
          {
1887
0
          *errorcodeptr = ERR61;
1888
0
          break;
1889
0
          }
1890
1891
0
        escape = -(s+1);
1892
0
        break;
1893
0
        }
1894
0
      }
1895
13.3k
    else
1896
13.3k
      {
1897
      /* Perl-style disambiguation. */
1898
13.3k
      oldptr = ptr;
1899
13.3k
      ptr--;   /* Back to the digit */
1900
1901
      /* As we know we are at a digit, the only possible error from
1902
      read_number() is a number that is too large to be a group number. Because
1903
      that number might be still valid if read as an octal, errorcodeptr is not
1904
      set on failure and therefore a sentinel value of INT_MAX is used instead
1905
      of the original value, and will be used later to properly set the error,
1906
      if not falling through. */
1907
1908
13.3k
      if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1909
381
        s = INT_MAX;
1910
1911
      /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1912
      are octal escapes if there are not that many previous captures. */
1913
1914
13.3k
      if (s < 10 || c >= CHAR_8 || (unsigned)s <= bracount)
1915
11.4k
        {
1916
        /* s > MAX_GROUP_NUMBER should not be possible because of read_number(),
1917
        but we keep it just to be safe and because it will also catch the
1918
        sentinel value that was set on failure by that function. */
1919
1920
11.4k
        if ((unsigned)s > MAX_GROUP_NUMBER)
1921
8
          {
1922
8
          PCRE2_ASSERT(s == INT_MAX);
1923
8
          *errorcodeptr = ERR61;
1924
8
          }
1925
11.4k
        else escape = -(s+1);     /* Indicates a back reference */
1926
11.4k
        break;
1927
11.4k
        }
1928
1929
1.86k
      ptr = oldptr;      /* Put the pointer back and fall through */
1930
1.86k
      }
1931
1932
    /* Handle a digit following \ when the number is not a back reference, or
1933
    we are within a character class. If the first digit is 8 or 9, Perl used to
1934
    generate a binary zero and then treat the digit as a following literal. At
1935
    least by Perl 5.18 this changed so as not to insert the binary zero. */
1936
1937
2.63k
    if (c >= CHAR_8) break;
1938
1939
2.30k
    PCRE2_FALLTHROUGH /* Fall through */
1940
2.30k
1941
2.30k
    /* \0 always starts an octal number, but we may drop through to here with a
1942
2.30k
    larger first octal digit. The original code used just to take the least
1943
2.30k
    significant 8 bits of octal numbers (I think this is what early Perls used
1944
2.30k
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,
1945
2.30k
    but no more than 3 octal digits. */
1946
2.30k
1947
3.34k
    case CHAR_0:
1948
3.34k
    c -= CHAR_0;
1949
6.79k
    while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1950
3.45k
        c = c * 8 + *ptr++ - CHAR_0;
1951
3.34k
    if (c > 0xff)
1952
794
      {
1953
794
      if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) *errorcodeptr = ERR102;
1954
#if PCRE2_CODE_UNIT_WIDTH == 8
1955
      else if (!utf) *errorcodeptr = ERR51;
1956
#endif
1957
794
      }
1958
1959
    /* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect
1960
    two- or three-character octal escapes \00 and \000, nor \x00. */
1961
1962
3.34k
    if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)
1963
0
        *errorcodeptr = ERR98;
1964
3.34k
    break;
1965
1966
    /* \o is a relatively new Perl feature, supporting a more general way of
1967
    specifying character codes in octal. The only supported form is \o{ddd},
1968
    with optional spaces or tabs after { and before }. */
1969
1970
1.32k
    case CHAR_o:
1971
1.32k
    if (ptr >= ptrend || *ptr != CHAR_LEFT_CURLY_BRACKET)
1972
8
      {
1973
8
      *errorcodeptr = ERR55;
1974
8
      break;
1975
8
      }
1976
1.31k
    ptr++;
1977
1978
2.01k
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1979
1.31k
    if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1980
26
      {
1981
26
      *errorcodeptr = ERR78;
1982
26
      break;
1983
26
      }
1984
1985
1.28k
    c = 0;
1986
1.28k
    overflow = FALSE;
1987
4.05k
    while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1988
2.81k
      {
1989
2.81k
      cc = *ptr++;
1990
2.81k
      if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1991
#if PCRE2_CODE_UNIT_WIDTH == 32
1992
      if (c >= 0x20000000u) { overflow = TRUE; break; }
1993
#endif
1994
2.57k
      c = (c << 3) + (cc - CHAR_0);
1995
#if PCRE2_CODE_UNIT_WIDTH == 8
1996
      if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1997
#elif PCRE2_CODE_UNIT_WIDTH == 16
1998
2.57k
      if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1999
#elif PCRE2_CODE_UNIT_WIDTH == 32
2000
      if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
2001
#endif
2002
2.57k
      }
2003
2004
3.03k
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2005
2006
1.28k
    if (overflow)
2007
49
      {
2008
265
      while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
2009
49
      *errorcodeptr = ERR34;
2010
49
      }
2011
1.24k
    else if (utf && c >= 0xd800 && c <= 0xdfff &&
2012
2
             (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2013
2
      {
2014
2
      *errorcodeptr = ERR73;
2015
2
      }
2016
1.23k
    else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)
2017
1.14k
      {
2018
1.14k
      ptr++;
2019
1.14k
      }
2020
90
    else
2021
90
      {
2022
90
      *errorcodeptr = ERR64;
2023
90
      goto ESCAPE_FAILED_FORWARD;
2024
90
      }
2025
1.19k
    break;
2026
2027
    /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
2028
    by two hexadecimal digits. Otherwise it is a lowercase x letter. */
2029
2030
3.41k
    case CHAR_x:
2031
3.41k
    if (alt_bsux)
2032
1.21k
      {
2033
1.21k
      uint32_t xc;
2034
1.21k
      if (ptrend - ptr < 2) break;               /* Less than 2 characters */
2035
1.21k
      if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
2036
579
      if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
2037
198
      c = (cc << 4) | xc;
2038
198
      ptr += 2;
2039
198
      }
2040
2041
    /* Handle \x in Perl's style. \x{ddd} is a character code which can be
2042
    greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
2043
    digits. If not, { used to be treated as a data character. However, Perl
2044
    seems to read hex digits up to the first non-such, and ignore the rest, so
2045
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2046
    now gives an error. */
2047
2048
2.20k
    else
2049
2.20k
      {
2050
2.20k
      if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
2051
1.41k
        {
2052
1.41k
        ptr++;
2053
1.86k
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2054
2055
1.41k
#ifndef EBCDIC
2056
1.50k
        COME_FROM_NU:
2057
1.50k
#endif
2058
1.50k
        if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
2059
28
          {
2060
28
          *errorcodeptr = ERR78;
2061
28
          break;
2062
28
          }
2063
1.47k
        c = 0;
2064
1.47k
        overflow = FALSE;
2065
2066
4.72k
        while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2067
3.29k
          {
2068
3.29k
          ptr++;
2069
3.29k
          if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2070
#if PCRE2_CODE_UNIT_WIDTH == 32
2071
          if (c >= 0x10000000l) { overflow = TRUE; break; }
2072
#endif
2073
2.76k
          c = (c << 4) | cc;
2074
2.76k
          if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2075
41
            {
2076
41
            overflow = TRUE;
2077
41
            break;
2078
41
            }
2079
2.76k
          }
2080
2081
        /* Perl ignores spaces and tabs before } */
2082
2083
2.47k
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2084
2085
        /* On overflow, skip remaining hex digits */
2086
2087
1.47k
        if (overflow)
2088
41
          {
2089
284
          while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2090
41
          *errorcodeptr = ERR34;
2091
41
          }
2092
1.43k
        else if (utf && c >= 0xd800 && c <= 0xdfff &&
2093
1
                 (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2094
1
          {
2095
1
          *errorcodeptr = ERR73;
2096
1
          }
2097
1.43k
        else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)
2098
1.32k
          {
2099
1.32k
          ptr++;
2100
1.32k
          }
2101
2102
        /* If the sequence of hex digits (followed by optional space) does not
2103
        end with '}', give an error. We used just to recognize this construct
2104
        and fall through to the normal \x handling, but nowadays Perl gives an
2105
        error, which seems much more sensible, so we do too. */
2106
2107
107
        else
2108
107
          {
2109
107
          *errorcodeptr = ERR67;
2110
107
          goto ESCAPE_FAILED_FORWARD;
2111
107
          }
2112
1.47k
        }   /* End of \x{} processing */
2113
2114
      /* Read a up to two hex digits after \x */
2115
2116
783
      else
2117
783
        {
2118
        /* Perl has the surprising/broken behaviour that \x without following
2119
        hex digits is treated as an escape for NUL. Their source code laments
2120
        this but keeps it for backwards compatibility. A warning is printed
2121
        when "use warnings" is enabled. Because we don't have warnings, we
2122
        simply forbid it. */
2123
783
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff)
2124
60
          {
2125
          /* Not a hex digit */
2126
60
          *errorcodeptr = ERR78;
2127
60
          break;
2128
60
          }
2129
723
        ptr++;
2130
723
        c = cc;
2131
2132
        /* With "use re 'strict'" Perl actually requires exactly two digits (error
2133
        for \x, \xA and \xAAA). While \x was already rejected, this seems overly
2134
        strict, and there seems little incentive to align with that, given the
2135
        backwards-compatibility cost.
2136
2137
        For comparison, note that other engines disagree. For example:
2138
          - Java allows 1 or 2 hex digits. Error if 0 digits. No error if >2 digits
2139
          - .NET requires 2 hex digits. Error if 0, 1 digits. No error if >2 digits.
2140
        */
2141
723
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2142
380
        ptr++;
2143
380
        c = (c << 4) | cc;
2144
380
        }     /* End of \xdd handling */
2145
2.20k
      }       /* End of Perl-style \x handling */
2146
1.94k
    break;
2147
2148
    /* The handling of \c is different in ASCII and EBCDIC environments. In an
2149
    ASCII (or Unicode) environment, an error is given if the character
2150
    following \c is not a printable ASCII character. Otherwise, the following
2151
    character is upper-cased if it is a letter, and after that the 0x40 bit is
2152
    flipped. The result is the value of the escape.
2153
2154
    In an EBCDIC environment the handling of \c is compatible with the
2155
    specification in the perlebcdic document. The following character must be
2156
    a letter or one of small number of special characters. These provide a
2157
    means of defining the character values 0-31.
2158
2159
    For testing the EBCDIC handling of \c in an ASCII environment, recognize
2160
    the EBCDIC value of 'c' explicitly. */
2161
2162
1.94k
    case CHAR_c:
2163
616
    if (ptr >= ptrend)
2164
3
      {
2165
3
      *errorcodeptr = ERR2;
2166
3
      break;
2167
3
      }
2168
613
    c = *ptr;
2169
613
    if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2170
2171
    /* Handle \c in an ASCII/Unicode environment. */
2172
2173
613
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
2174
613
    if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2175
23
      {
2176
23
      *errorcodeptr = ERR68;
2177
23
      goto ESCAPE_FAILED_FORWARD;
2178
23
      }
2179
590
    c ^= 0x40;
2180
2181
    /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2182
    255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2183
    POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2184
    The other valid sequences correspond to a list of specific characters. */
2185
2186
#else
2187
    if (c == CHAR_QUESTION_MARK)
2188
      c = (CHAR_BACKSLASH == 188 && CHAR_GRAVE_ACCENT == 74)? 0x5f : 0xff;
2189
    else
2190
      {
2191
      for (i = 0; i < 32; i++)
2192
        {
2193
        if (c == ebcdic_escape_c[i]) break;
2194
        }
2195
      if (i < 32)
2196
        c = i;
2197
      else
2198
        {
2199
        *errorcodeptr = ERR68;
2200
        goto ESCAPE_FAILED_FORWARD;
2201
        }
2202
      }
2203
#endif  /* EBCDIC */
2204
2205
590
    ptr++;
2206
590
    break;
2207
2208
    /* Any other alphanumeric following \ is an error. Perl gives an error only
2209
    if in warning mode, but PCRE doesn't have a warning mode. */
2210
2211
3
    default:
2212
3
    *errorcodeptr = ERR3;
2213
3
    break;
2214
26.3k
    }
2215
26.3k
  }
2216
2217
/* Set the pointer to the next character before returning. */
2218
2219
281k
EXIT:
2220
281k
*ptrptr = ptr;
2221
281k
*chptr = c;
2222
281k
return escape;
2223
2224
/* Some errors need to indicate the next character. */
2225
2226
220
ESCAPE_FAILED_FORWARD:
2227
220
ptr++;
2228
220
#ifdef SUPPORT_UNICODE
2229
220
if (utf) FORWARDCHARTEST(ptr, ptrend);
2230
220
#endif
2231
220
goto EXIT;
2232
281k
}
2233
2234
2235
2236
#ifdef SUPPORT_UNICODE
2237
/*************************************************
2238
*               Handle \P and \p                 *
2239
*************************************************/
2240
2241
/* This function is called after \P or \p has been encountered, provided that
2242
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2243
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2244
after the final code unit of the escape sequence.
2245
2246
Arguments:
2247
  ptrptr         the pattern position pointer
2248
  utf            true if the input is UTF-encoded
2249
  negptr         a boolean that is set TRUE for negation else FALSE
2250
  ptypeptr       an unsigned int that is set to the type value
2251
  pdataptr       an unsigned int that is set to the detailed property value
2252
  errorcodeptr   the error code variable
2253
  cb             the compile data
2254
2255
Returns:         TRUE if the type value was found, or FALSE for an invalid type
2256
*/
2257
2258
static BOOL
2259
get_ucp(PCRE2_SPTR *ptrptr, BOOL utf, BOOL *negptr, uint16_t *ptypeptr,
2260
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2261
10.8k
{
2262
10.8k
uint32_t c;
2263
10.8k
ptrdiff_t i;
2264
10.8k
PCRE2_SIZE bot, top;
2265
10.8k
PCRE2_SPTR ptr = *ptrptr;
2266
10.8k
PCRE2_UCHAR name[50];
2267
10.8k
PCRE2_UCHAR *vptr = NULL;
2268
10.8k
uint16_t ptscript = PT_NOTSCRIPT;
2269
2270
#ifndef MAYBE_UTF_MULTI
2271
(void)utf;  /* Avoid compiler warning */
2272
#endif
2273
2274
10.8k
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2275
10.8k
GETCHARINCTEST(c, ptr);
2276
10.8k
*negptr = FALSE;
2277
2278
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2279
negation. We must be handling Unicode encoding here, though we may be compiling
2280
for UTF-8 input in an EBCDIC environment. (PCRE2 does not support both EBCDIC
2281
input and Unicode input in the same build.) In accordance with Unicode's "loose
2282
matching" rules, ASCII white space, hyphens, and underscores are ignored. We
2283
don't use isspace() or tolower() because (a) code points may be greater than
2284
255, and (b) they wouldn't work when compiling for Unicode in an EBCDIC
2285
environment. */
2286
2287
10.8k
if (c == CHAR_LEFT_CURLY_BRACKET)
2288
7.73k
  {
2289
7.73k
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2290
2291
29.3k
  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2292
29.3k
    {
2293
31.4k
    REDO:
2294
2295
31.4k
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2296
31.3k
    GETCHARINCTEST(c, ptr);
2297
2298
    /* Skip ignorable Unicode characters. */
2299
2300
31.3k
    if (c == CHAR_UNDERSCORE || c == CHAR_MINUS || c == CHAR_SPACE ||
2301
30.0k
        (c >= CHAR_HT && c <= CHAR_CR))
2302
1.58k
      {
2303
1.58k
      goto REDO;
2304
1.58k
      }
2305
2306
    /* The first significant character being circumflex negates the meaning of
2307
    the item. */
2308
2309
29.7k
    if (i == 0 && !*negptr && c == CHAR_CIRCUMFLEX_ACCENT)
2310
531
      {
2311
531
      *negptr = TRUE;
2312
531
      goto REDO;
2313
531
      }
2314
2315
29.2k
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2316
2317
    /* Names consist of ASCII letters and digits, but equals and colon may also
2318
    occur as a name/value separator. We must also allow for \p{L&}. A simple
2319
    check for a value between '&' and 'z' suffices because anything else in a
2320
    name or value will cause an "unknown property" error anyway. */
2321
2322
21.6k
    if (c < CHAR_AMPERSAND || c > CHAR_z) goto ERROR_RETURN;
2323
2324
    /* Lower case a capital letter or remember where the name/value separator
2325
    is. */
2326
2327
21.5k
    if (c >= CHAR_A && c <= CHAR_Z) c |= 0x20;
2328
13.4k
    else if ((c == CHAR_COLON || c == CHAR_EQUALS_SIGN) && vptr == NULL)
2329
1.62k
      vptr = name + i;
2330
2331
21.5k
    name[i] = c;
2332
21.5k
    }
2333
2334
  /* Error if the loop didn't end with '}' - either we hit the end of the
2335
  pattern or the name was longer than any legal property name. */
2336
2337
7.56k
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2338
7.56k
  name[i] = 0;
2339
7.56k
  }
2340
2341
/* If { doesn't follow \p or \P there is just one following character, which
2342
must be an ASCII letter. */
2343
2344
3.06k
else if (c >= CHAR_A && c <= CHAR_Z)
2345
1.65k
  {
2346
1.65k
  name[0] = c | 0x20;  /* Lower case */
2347
1.65k
  name[1] = 0;
2348
1.65k
  }
2349
1.41k
else if (c >= CHAR_a && c <= CHAR_z)
2350
1.36k
  {
2351
1.36k
  name[0] = c;
2352
1.36k
  name[1] = 0;
2353
1.36k
  }
2354
46
else goto ERROR_RETURN;
2355
2356
10.5k
*ptrptr = ptr;   /* Update pattern pointer */
2357
2358
/* If the property contains ':' or '=' we have class name and value separately
2359
specified. The following are supported:
2360
2361
  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2362
  . Script (synonym sc) for which the property name is the script name
2363
  . Script_Extensions (synonym scx), ditto
2364
2365
As this is a small number, we currently just check the names directly. If this
2366
grows, a sorted table and a switch will be neater.
2367
2368
For both the script properties, set a PT_xxx value so that (1) they can be
2369
distinguished and (2) invalid script names that happen to be the name of
2370
another property can be diagnosed. */
2371
2372
10.5k
if (vptr != NULL)
2373
1.57k
  {
2374
1.57k
  int offset = 0;
2375
1.57k
  PCRE2_UCHAR sname[8];
2376
2377
1.57k
  *vptr = 0;   /* Terminate property name */
2378
1.57k
  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2379
1.57k
      PRIV(strcmp_c8)(name, STRING_bc) == 0)
2380
881
    {
2381
881
    offset = 4;
2382
881
    sname[0] = CHAR_b;
2383
881
    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2384
881
    sname[2] = CHAR_d;
2385
881
    sname[3] = CHAR_i;
2386
881
    }
2387
2388
698
  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2389
698
           PRIV(strcmp_c8)(name, STRING_sc) == 0)
2390
225
    ptscript = PT_SC;
2391
2392
473
  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2393
473
           PRIV(strcmp_c8)(name, STRING_scx) == 0)
2394
470
    ptscript = PT_SCX;
2395
2396
3
  else
2397
3
    {
2398
3
    *errorcodeptr = ERR47;
2399
3
    return FALSE;
2400
3
    }
2401
2402
  /* Adjust the string in name[] as needed */
2403
2404
1.57k
  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2405
1.57k
  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2406
1.57k
  }
2407
2408
/* Search for a recognized property using binary chop. */
2409
2410
10.5k
bot = 0;
2411
10.5k
top = PRIV(utt_size);
2412
2413
82.2k
while (bot < top)
2414
82.1k
  {
2415
82.1k
  int r;
2416
82.1k
  i = (bot + top) >> 1;
2417
82.1k
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2418
2419
  /* When a matching property is found, some extra checking is needed when the
2420
  \p{xx:yy} syntax is used and xx is either sc or scx. */
2421
2422
82.1k
  if (r == 0)
2423
10.5k
    {
2424
10.5k
    *pdataptr = PRIV(utt)[i].value;
2425
10.5k
    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2426
9.86k
      {
2427
9.86k
      *ptypeptr = PRIV(utt)[i].type;
2428
9.86k
      return TRUE;
2429
9.86k
      }
2430
2431
695
    switch (PRIV(utt)[i].type)
2432
695
      {
2433
86
      case PT_SC:
2434
86
      *ptypeptr = PT_SC;
2435
86
      return TRUE;
2436
2437
606
      case PT_SCX:
2438
606
      *ptypeptr = ptscript;
2439
606
      return TRUE;
2440
695
      }
2441
2442
3
    break;  /* Non-script found */
2443
695
    }
2444
2445
71.6k
  if (r > 0) bot = i + 1; else top = i;
2446
71.6k
  }
2447
2448
27
*errorcodeptr = ERR47;   /* Unrecognized property */
2449
27
return FALSE;
2450
2451
233
ERROR_RETURN:            /* Malformed \P or \p */
2452
233
*errorcodeptr = ERR46;
2453
233
*ptrptr = ptr;
2454
233
return FALSE;
2455
10.5k
}
2456
#endif
2457
2458
2459
2460
/*************************************************
2461
*           Check for POSIX class syntax         *
2462
*************************************************/
2463
2464
/* This function is called when the sequence "[:" or "[." or "[=" is
2465
encountered in a character class. It checks whether this is followed by a
2466
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2467
reach an unescaped ']' without the special preceding character, return FALSE.
2468
2469
Originally, this function only recognized a sequence of letters between the
2470
terminators, but it seems that Perl recognizes any sequence of characters,
2471
though of course unknown POSIX names are subsequently rejected. Perl gives an
2472
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2473
didn't consider this to be a POSIX class. Likewise for [:1234:].
2474
2475
The problem in trying to be exactly like Perl is in the handling of escapes. We
2476
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2477
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2478
below handles the special cases \\ and \], but does not try to do any other
2479
escape processing. This makes it different from Perl for cases such as
2480
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2481
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2482
when Perl does, I think.
2483
2484
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2485
It seems that the appearance of a nested POSIX class supersedes an apparent
2486
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2487
a digit. This is handled by returning FALSE if the start of a new group with
2488
the same terminator is encountered, since the next closing sequence must close
2489
the nested group, not the outer one.
2490
2491
In Perl, unescaped square brackets may also appear as part of class names. For
2492
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2493
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2494
seem right at all. PCRE does not allow closing square brackets in POSIX class
2495
names.
2496
2497
Arguments:
2498
  ptr      pointer to the character after the initial [ (colon, dot, equals)
2499
  ptrend   pointer to the end of the pattern
2500
  endptr   where to return a pointer to the terminating ':', '.', or '='
2501
2502
Returns:   TRUE or FALSE
2503
*/
2504
2505
static BOOL
2506
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2507
23.7k
{
2508
23.7k
PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2509
23.7k
terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2510
2511
265k
for (; ptrend - ptr >= 2; ptr++)
2512
265k
  {
2513
265k
  if (*ptr == CHAR_BACKSLASH &&
2514
5.99k
      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2515
453
    ptr++;
2516
2517
265k
  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2518
262k
            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2519
2520
256k
  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2521
14.9k
    {
2522
14.9k
    *endptr = ptr;
2523
14.9k
    return TRUE;
2524
14.9k
    }
2525
265k
  }
2526
2527
287
return FALSE;
2528
23.7k
}
2529
2530
2531
2532
/*************************************************
2533
*          Check POSIX class name                *
2534
*************************************************/
2535
2536
/* This function is called to check the name given in a POSIX-style class entry
2537
such as [:alnum:].
2538
2539
Arguments:
2540
  ptr        points to the first letter
2541
  len        the length of the name
2542
2543
Returns:     a value representing the name, or -1 if unknown
2544
*/
2545
2546
static int
2547
check_posix_name(PCRE2_SPTR ptr, int len)
2548
14.8k
{
2549
14.8k
const char *pn = posix_names;
2550
14.8k
int yield = 0;
2551
139k
while (posix_name_lengths[yield] != 0)
2552
139k
  {
2553
139k
  if (len == posix_name_lengths[yield] &&
2554
80.0k
    PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2555
124k
  pn += posix_name_lengths[yield] + 1;
2556
124k
  yield++;
2557
124k
  }
2558
46
return -1;
2559
14.8k
}
2560
2561
2562
2563
/*************************************************
2564
*       Read a subpattern or VERB name           *
2565
*************************************************/
2566
2567
/* This function is called from parse_regex() below whenever it needs to read
2568
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2569
pointer must be to the preceding character. If that character is '*' we are
2570
reading a verb or alpha assertion name. The pointer is updated to point after
2571
the name, for a VERB or alpha assertion name, or after the name's terminator
2572
for a subpattern name. Returning both the offset and the name pointer is
2573
redundant information, but some callers use one and some the other, so it is
2574
simplest just to return both. When the name is in braces, spaces and tabs are
2575
allowed (and ignored) at either end.
2576
2577
Arguments:
2578
  ptrptr      points to the character pointer variable
2579
  ptrend      points to the end of the input string
2580
  utf         true if the input is UTF-encoded
2581
  terminator  the terminator of a subpattern name must be this
2582
  offsetptr   where to put the offset from the start of the pattern
2583
  nameptr     where to put a pointer to the name in the input
2584
  namelenptr  where to put the length of the name
2585
  errorcodeptr  where to put an error code
2586
  cb          pointer to the compile data block
2587
2588
Returns:    TRUE if a name was read
2589
            FALSE otherwise, with error code set
2590
*/
2591
2592
static BOOL
2593
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2594
  PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2595
  int *errorcodeptr, compile_block *cb)
2596
52.6k
{
2597
52.6k
PCRE2_SPTR ptr = *ptrptr;
2598
52.6k
BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2599
52.6k
BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2600
2601
52.6k
if (is_braced)
2602
1.67k
  while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2603
2604
52.6k
if (ptr >= ptrend)                 /* No characters in name */
2605
79
  {
2606
79
  *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2607
79
                            ERR60; /* Verb not recognized or malformed */
2608
79
  goto FAILED;
2609
79
  }
2610
2611
52.6k
*nameptr = ptr;
2612
52.6k
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2613
2614
/* If this logic were ever to change, the matching function in pcre2_substitute.c
2615
ought to be updated to match. */
2616
2617
/* In UTF mode, a group name may contain letters and decimal digits as defined
2618
by Unicode properties, and underscores, but must not start with a digit. */
2619
2620
52.6k
#ifdef SUPPORT_UNICODE
2621
52.6k
if (utf && is_group)
2622
8.78k
  {
2623
8.78k
  uint32_t c, type;
2624
8.78k
  PCRE2_SPTR p = ptr;
2625
2626
8.78k
  GETCHARINC(c, p);  /* Peek at next character */
2627
8.78k
  type = UCD_CHARTYPE(c);
2628
2629
8.78k
  if (type == ucp_Nd)
2630
1
    {
2631
1
    ptr = p;
2632
1
    *errorcodeptr = ERR44;
2633
1
    goto FAILED;
2634
1
    }
2635
2636
8.78k
  for(;;)
2637
20.9k
    {
2638
20.9k
    if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2639
8.77k
        c != CHAR_UNDERSCORE) break;
2640
12.2k
    ptr = p;  /* Accept character and peek again */
2641
12.2k
    if (p >= ptrend) break;
2642
12.1k
    GETCHARINC(c, p);
2643
12.1k
    type = UCD_CHARTYPE(c);
2644
12.1k
    }
2645
8.78k
  }
2646
43.8k
else
2647
#else
2648
(void)utf;  /* Avoid compiler warning */
2649
#endif      /* SUPPORT_UNICODE */
2650
2651
/* Handle non-group names and group names in non-UTF modes. A group name must
2652
not start with a digit. If either of the others start with a digit it just
2653
won't be recognized. */
2654
2655
43.8k
  {
2656
43.8k
  if (is_group && IS_DIGIT(*ptr))
2657
3
    {
2658
3
    ++ptr;
2659
3
    *errorcodeptr = ERR44;
2660
3
    goto FAILED;
2661
3
    }
2662
2663
172k
  while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2664
128k
    {
2665
128k
    ptr++;
2666
128k
    }
2667
43.8k
  }
2668
2669
/* Check name length */
2670
2671
52.6k
if (ptr - *nameptr > MAX_NAME_SIZE)
2672
9
  {
2673
9
  *errorcodeptr = ERR48;
2674
9
  goto FAILED;
2675
9
  }
2676
52.5k
*namelenptr = (uint32_t)(ptr - *nameptr);
2677
2678
/* Subpattern names must not be empty, and their terminator is checked here.
2679
(What follows a verb or alpha assertion name is checked separately.) */
2680
2681
52.5k
if (is_group)
2682
26.2k
  {
2683
26.2k
  if (ptr == *nameptr)
2684
282
    {
2685
282
    *errorcodeptr = ERR62;   /* Subpattern name expected */
2686
282
    goto FAILED;
2687
282
    }
2688
25.9k
  if (is_braced)
2689
1.79k
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2690
25.9k
  if (terminator != 0)
2691
25.2k
    {
2692
25.2k
    if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2693
105
      {
2694
105
      *errorcodeptr = ERR42;
2695
105
      goto FAILED;
2696
105
      }
2697
25.1k
    ptr++;
2698
25.1k
    }
2699
25.9k
  }
2700
2701
52.2k
*ptrptr = ptr;
2702
52.2k
return TRUE;
2703
2704
479
FAILED:
2705
479
*ptrptr = ptr;
2706
479
return FALSE;
2707
52.5k
}
2708
2709
2710
2711
/**************************************************
2712
*        Parse capturing bracket argument list    *
2713
**************************************************/
2714
2715
/* Reads a list of capture references. The references
2716
can be numbers or names.
2717
2718
Arguments:
2719
  ptrptr           points to the character pointer variable
2720
  ptrend           points to the end of the input string
2721
  utf              true if the input is UTF-encoded
2722
  parsed_pattern   the parsed pattern pointer
2723
  offset           last known offset
2724
  errorcodeptr     where to put an error code
2725
  cb               pointer to the compile data block
2726
2727
Returns: updated parsed_pattern pointer on success
2728
         NULL otherwise
2729
*/
2730
2731
static uint32_t *
2732
parse_capture_list(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
2733
  BOOL utf, uint32_t *parsed_pattern, PCRE2_SIZE offset,
2734
  int *errorcodeptr, compile_block *cb)
2735
9.62k
{
2736
9.62k
PCRE2_SIZE next_offset;
2737
9.62k
PCRE2_SPTR ptr = *ptrptr;
2738
9.62k
PCRE2_SPTR name;
2739
9.62k
PCRE2_UCHAR terminator;
2740
9.62k
uint32_t meta, namelen;
2741
9.62k
int i;
2742
2743
9.62k
if (ptr >= ptrend || *ptr != CHAR_LEFT_PARENTHESIS)
2744
10
  {
2745
10
  *errorcodeptr = ERR118;
2746
10
  goto FAILED;
2747
10
  }
2748
2749
9.61k
for (;;)
2750
9.67k
  {
2751
9.67k
  ptr++;
2752
9.67k
  next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
2753
2754
9.67k
  if (ptr >= ptrend)
2755
10
    {
2756
10
    *errorcodeptr = ERR117;
2757
10
    goto FAILED;
2758
10
    }
2759
2760
  /* Handle [+-]number cases */
2761
9.66k
  if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
2762
9.66k
      &i, errorcodeptr))
2763
7.36k
    {
2764
7.36k
    PCRE2_ASSERT(i >= 0);
2765
7.36k
    if (i <= 0)
2766
3
      {
2767
3
      *errorcodeptr = ERR15;
2768
3
      goto FAILED;
2769
3
      }
2770
7.35k
    meta = META_CAPTURE_NUMBER;
2771
7.35k
    namelen = (uint32_t)i;
2772
7.35k
    }
2773
2.30k
  else if (*errorcodeptr != 0) goto FAILED; /* Number too big */
2774
2.27k
  else
2775
2.27k
    {
2776
    /* Handle 'name' or <name> cases. */
2777
2.27k
    if (*ptr == CHAR_LESS_THAN_SIGN)
2778
2.24k
      terminator = CHAR_GREATER_THAN_SIGN;
2779
30
    else if (*ptr == CHAR_APOSTROPHE)
2780
3
      terminator = CHAR_APOSTROPHE;
2781
27
    else
2782
27
      {
2783
27
      *errorcodeptr = ERR117;
2784
27
      goto FAILED;
2785
27
      }
2786
2787
2.24k
    if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,
2788
2.24k
        &name, &namelen, errorcodeptr, cb)) goto FAILED;
2789
2790
2.23k
    meta = META_CAPTURE_NAME;
2791
2.23k
    }
2792
2793
9.59k
  PCRE2_ASSERT(next_offset > 0);
2794
9.59k
  if (offset == 0 || (next_offset - offset) >= 0x10000)
2795
776
    {
2796
776
    *parsed_pattern++ = META_OFFSET;
2797
776
    PUTOFFSET(next_offset, parsed_pattern);
2798
776
    offset = next_offset;
2799
776
    }
2800
2801
  /* The offset is encoded as a relative offset, because for some
2802
  inputs such as ",2" in (1,2,3), we only have space for two uint32_t
2803
  values, and an opcode and absolute offset may require three uint32_t
2804
  values. */
2805
9.59k
  *parsed_pattern++ = meta | (uint32_t)(next_offset - offset);
2806
9.59k
  *parsed_pattern++ = namelen;
2807
9.59k
  offset = next_offset;
2808
2809
9.59k
  if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
2810
2811
9.56k
  if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
2812
2813
74
  if (*ptr != CHAR_COMMA)
2814
14
    {
2815
14
    *errorcodeptr = ERR24;
2816
14
    goto FAILED;
2817
14
    }
2818
74
  }
2819
2820
9.48k
*ptrptr = ptr + 1;
2821
9.48k
return parsed_pattern;
2822
2823
30
UNCLOSED_PARENTHESIS:
2824
30
*errorcodeptr = ERR14;
2825
2826
136
FAILED:
2827
136
*ptrptr = ptr;
2828
136
return NULL;
2829
30
}
2830
2831
2832
2833
/*************************************************
2834
*          Manage callouts at start of cycle     *
2835
*************************************************/
2836
2837
/* At the start of a new item in parse_regex() we are able to record the
2838
details of the previous item in a prior callout, and also to set up an
2839
automatic callout if enabled. Avoid having two adjacent automatic callouts,
2840
which would otherwise happen for items such as \Q that contribute nothing to
2841
the parsed pattern.
2842
2843
Arguments:
2844
  ptr              current pattern pointer
2845
  pcalloutptr      points to a pointer to previous callout, or NULL
2846
  auto_callout     TRUE if auto_callouts are enabled
2847
  parsed_pattern   the parsed pattern pointer
2848
  cb               compile block
2849
2850
Returns: possibly updated parsed_pattern pointer.
2851
*/
2852
2853
static uint32_t *
2854
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2855
  uint32_t *parsed_pattern, compile_block *cb)
2856
7.07M
{
2857
7.07M
uint32_t *previous_callout = *pcalloutptr;
2858
2859
7.07M
if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2860
797k
  cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2861
2862
7.07M
if (!auto_callout) previous_callout = NULL; else
2863
810k
  {
2864
810k
  if (previous_callout == NULL ||
2865
795k
      previous_callout != parsed_pattern - 4 ||
2866
1.30k
      previous_callout[3] != 255)
2867
809k
    {
2868
809k
    previous_callout = parsed_pattern;  /* Set up new automatic callout */
2869
809k
    parsed_pattern += 4;
2870
809k
    previous_callout[0] = META_CALLOUT_NUMBER;
2871
809k
    previous_callout[2] = 0;
2872
809k
    previous_callout[3] = 255;
2873
809k
    }
2874
810k
  previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2875
810k
  }
2876
2877
7.07M
*pcalloutptr = previous_callout;
2878
7.07M
return parsed_pattern;
2879
7.07M
}
2880
2881
2882
2883
/*************************************************
2884
*          Handle \d, \D, \s, \S, \w, \W         *
2885
*************************************************/
2886
2887
/* This function is called from parse_regex() below, both for freestanding
2888
escapes, and those within classes, to handle those escapes that may change when
2889
Unicode property support is requested. Note that PCRE2_UCP will never be set
2890
without Unicode support because that is checked when pcre2_compile() is called.
2891
2892
Arguments:
2893
  escape          the ESC_... value
2894
  parsed_pattern  where to add the code
2895
  options         options bits
2896
  xoptions        extra options bits
2897
2898
Returns:          updated value of parsed_pattern
2899
*/
2900
static uint32_t *
2901
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2902
  uint32_t xoptions)
2903
85.4k
{
2904
85.4k
uint32_t ascii_option = 0;
2905
85.4k
uint32_t prop = ESC_p;
2906
2907
85.4k
switch(escape)
2908
85.4k
  {
2909
20.1k
  case ESC_D:
2910
20.1k
  prop = ESC_P;
2911
20.1k
  PCRE2_FALLTHROUGH /* Fall through */
2912
31.1k
  case ESC_d:
2913
31.1k
  ascii_option = PCRE2_EXTRA_ASCII_BSD;
2914
31.1k
  break;
2915
2916
20.4k
  case ESC_S:
2917
20.4k
  prop = ESC_P;
2918
20.4k
  PCRE2_FALLTHROUGH /* Fall through */
2919
30.4k
  case ESC_s:
2920
30.4k
  ascii_option = PCRE2_EXTRA_ASCII_BSS;
2921
30.4k
  break;
2922
2923
11.8k
  case ESC_W:
2924
11.8k
  prop = ESC_P;
2925
11.8k
  PCRE2_FALLTHROUGH /* Fall through */
2926
23.8k
  case ESC_w:
2927
23.8k
  ascii_option = PCRE2_EXTRA_ASCII_BSW;
2928
23.8k
  break;
2929
85.4k
  }
2930
2931
85.4k
if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2932
61.8k
  {
2933
61.8k
  *parsed_pattern++ = META_ESCAPE + escape;
2934
61.8k
  }
2935
23.6k
else
2936
23.6k
  {
2937
23.6k
  *parsed_pattern++ = META_ESCAPE + prop;
2938
23.6k
  switch(escape)
2939
23.6k
    {
2940
3.47k
    case ESC_d:
2941
7.31k
    case ESC_D:
2942
7.31k
    *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2943
7.31k
    break;
2944
2945
2.68k
    case ESC_s:
2946
9.36k
    case ESC_S:
2947
9.36k
    *parsed_pattern++ = PT_SPACE << 16;
2948
9.36k
    break;
2949
2950
3.82k
    case ESC_w:
2951
6.92k
    case ESC_W:
2952
6.92k
    *parsed_pattern++ = PT_WORD << 16;
2953
6.92k
    break;
2954
23.6k
    }
2955
23.6k
  }
2956
2957
85.4k
return parsed_pattern;
2958
85.4k
}
2959
2960
2961
2962
/*************************************************
2963
* Maximum size of parsed_pattern for given input *
2964
*************************************************/
2965
2966
/* This function is called from parse_regex() below, to determine the amount
2967
of memory to allocate for parsed_pattern. It is also called to check whether
2968
the amount of data written respects the amount of memory allocated.
2969
2970
Arguments:
2971
  ptr             points to the start of the pattern
2972
  ptrend          points to the end of the pattern
2973
  utf             TRUE in UTF mode
2974
  options         the options bits
2975
2976
Returns:          the number of uint32_t units for parsed_pattern
2977
*/
2978
static ptrdiff_t
2979
max_parsed_pattern(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, BOOL utf,
2980
  uint32_t options)
2981
82.1k
{
2982
82.1k
PCRE2_SIZE big32count = 0;
2983
82.1k
ptrdiff_t parsed_size_needed;
2984
2985
/* When PCRE2_AUTO_CALLOUT is not set, in all but one case the number of
2986
unsigned 32-bit ints written out to the parsed pattern is bounded by the length
2987
of the pattern. The exceptional case is when running in 32-bit, non-UTF mode,
2988
when literal characters greater than META_END (0x80000000) have to be coded as
2989
two units. In this case, therefore, we scan the pattern to check for such
2990
values. */
2991
2992
#if PCRE2_CODE_UNIT_WIDTH == 32
2993
if (!utf)
2994
  {
2995
  PCRE2_SPTR p;
2996
  for (p = ptr; p < ptrend; p++) if (*p >= META_END) big32count++;
2997
  }
2998
#else
2999
82.1k
(void)utf;  /* Avoid compiler warning */
3000
82.1k
#endif
3001
3002
82.1k
parsed_size_needed = (ptrend - ptr) + big32count;
3003
3004
/* When PCRE2_AUTO_CALLOUT is set we have to assume a numerical callout (4
3005
elements) for each character. This is overkill, but memory is plentiful these
3006
days. */
3007
3008
82.1k
if ((options & PCRE2_AUTO_CALLOUT) != 0)
3009
15.4k
  parsed_size_needed += (ptrend - ptr) * 4;
3010
3011
82.1k
return parsed_size_needed;
3012
82.1k
}
3013
3014
3015
3016
/*************************************************
3017
*      Parse regex and identify named groups     *
3018
*************************************************/
3019
3020
/* This function is called first of all. It scans the pattern and does two
3021
things: (1) It identifies capturing groups and makes a table of named capturing
3022
groups so that information about them is fully available to both the compiling
3023
scans. (2) It writes a parsed version of the pattern with comments omitted and
3024
escapes processed into the parsed_pattern vector.
3025
3026
Arguments:
3027
  ptr             points to the start of the pattern
3028
  options         compiling dynamic options (may change during the scan)
3029
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
3030
  cb              pointer to the compile data block
3031
3032
Returns:   zero on success or a non-zero error code, with the
3033
             error offset placed in the cb field
3034
*/
3035
3036
/* A structure and some flags for dealing with nested groups. */
3037
3038
/* NOTE(review): the per-field remarks below are read off the field names; the
code that fills in and consults this structure is in parse_regex() - confirm
there. */

typedef struct nest_save {
  uint16_t  nest_depth;    /* presumably the nesting depth this entry is for */
  uint16_t  reset_group;   /* presumably tied to NSF_RESET handling */
  uint16_t  max_group;     /* presumably tied to NSF_RESET handling */
  uint16_t  flags;         /* presumably NSF_xxx bits */
  uint32_t  options;       /* presumably the saved options bits */
  uint32_t  xoptions;      /* presumably the saved extra options bits */
} nest_save;

/* Flag bits for use with nest_save; one bit each. */

#define NSF_RESET          (1u << 0)
#define NSF_CONDASSERT     (1u << 1)
#define NSF_ATOMICSR       (1u << 2)

/* Every option that can be changed from within the pattern has to be tracked
while parsing. Some of them (PCRE2_EXTENDED, for one) are dealt with entirely
by the parser, but all of them are tracked so that META_OPTIONS items carry the
right values into the main compiling phase. */

#define PARSE_TRACKED_OPTIONS \
  (PCRE2_CASELESS|PCRE2_DOTALL|PCRE2_DUPNAMES|PCRE2_EXTENDED| \
  PCRE2_EXTENDED_MORE|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE| \
  PCRE2_UNGREEDY)

#define PARSE_TRACKED_EXTRA_OPTIONS \
  (PCRE2_EXTRA_CASELESS_RESTRICT|PCRE2_EXTRA_ASCII_BSD| \
  PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW| \
  PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX)

/* States for analyzing ranges in character classes. The two OK values have to
remain the last (highest) ones. */

enum {
  RANGE_NO             = 0,  /* After '[' (initial) or '[a-z'; hyphen is literal */
  RANGE_STARTED        = 1,  /* After '[1-'; last-emitted code is META_RANGE_XYZ */
  RANGE_FORBID_NO      = 2,  /* After '[\d'; '-]' is allowed but not '-1]' */
  RANGE_FORBID_STARTED = 3,  /* After '[\d-' */
  RANGE_OK_ESCAPED     = 4,  /* After '[\1'; hyphen may be a range */
  RANGE_OK_LITERAL     = 5   /* After '[1'; hyphen may be a range */
};

/* States for analyzing operators and operands in extended character
classes. */

enum {
  CLASS_OP_EMPTY    = 0,  /* Start of an expression; nothing precedes */
  CLASS_OP_OPERAND  = 1,  /* Operand precedes; after "z" a "--" can follow */
  CLASS_OP_OPERATOR = 2   /* Operator precedes; after "--" operand must follow */
};

/* States for determining the parse mode in character classes. The two
PERL_EXT values have to remain the last (highest) ones. */

enum {
  CLASS_MODE_NORMAL        = 0,  /* Ordinary PCRE2 '[...]' class. */
  CLASS_MODE_ALT_EXT       = 1,  /* UTS#18-style extended '[...]' class. */
  CLASS_MODE_PERL_EXT      = 2,  /* Perl extended '(?[...])' class. */
  CLASS_MODE_PERL_EXT_LEAF = 3   /* Leaf within extended '(?[ [...] ])' class. */
};

/* Literals greater than META_END can occur only in 32-bit mode. This macro
wraps up the storing of a literal in the main parsed pattern, where it can
always be quantified. It assigns to a variable called okquantifier, which must
be in scope where the macro is used. The form used outside 32-bit mode expands
to two separate statements, so it must not be the unbraced body of an if, else
or loop. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  { \
  if (c >= META_END) *p++ = META_BIGVALUE; \
  *p++ = c; \
  okquantifier = TRUE; \
  }
#else
#define PARSED_LITERAL(c, p) *p++ = c; okquantifier = TRUE;
#endif
3109
3110
/* Here's the actual function. */
3111
3112
static int parse_regex(PCRE2_SPTR ptr, uint32_t options, uint32_t xoptions,
3113
  BOOL *has_lookbehind, compile_block *cb)
3114
82.1k
{
3115
82.1k
uint32_t c;
3116
82.1k
uint32_t delimiter;
3117
82.1k
uint32_t namelen;
3118
82.1k
uint32_t class_range_state;
3119
82.1k
uint32_t class_op_state;
3120
82.1k
uint32_t class_mode_state;
3121
82.1k
uint32_t *class_start;
3122
82.1k
uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
3123
82.1k
uint32_t *verbstartptr = NULL;
3124
82.1k
uint32_t *previous_callout = NULL;
3125
82.1k
uint32_t *parsed_pattern = cb->parsed_pattern;
3126
82.1k
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
3127
82.1k
uint32_t *this_parsed_item = NULL;
3128
82.1k
uint32_t *prev_parsed_item = NULL;
3129
82.1k
uint32_t meta_quantifier = 0;
3130
82.1k
uint32_t add_after_mark = 0;
3131
82.1k
uint16_t nest_depth = 0;
3132
82.1k
int16_t class_depth_m1 = -1; /* The m1 means minus 1. */
3133
82.1k
int16_t class_maxdepth_m1 = -1;
3134
82.1k
uint16_t hash;
3135
82.1k
int after_manual_callout = 0;
3136
82.1k
int expect_cond_assert = 0;
3137
82.1k
int errorcode = 0;
3138
82.1k
int escape;
3139
82.1k
int i;
3140
82.1k
BOOL inescq = FALSE;
3141
82.1k
BOOL inverbname = FALSE;
3142
82.1k
BOOL utf = (options & PCRE2_UTF) != 0;
3143
82.1k
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
3144
82.1k
BOOL is_dupname;
3145
82.1k
BOOL negate_class;
3146
82.1k
BOOL okquantifier = FALSE;
3147
82.1k
PCRE2_SPTR thisptr;
3148
82.1k
PCRE2_SPTR name;
3149
82.1k
PCRE2_SPTR ptrend = cb->end_pattern;
3150
82.1k
PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
3151
82.1k
PCRE2_SPTR class_range_forbid_ptr = NULL;
3152
82.1k
named_group *ng;
3153
82.1k
nest_save *top_nest, *end_nests;
3154
#ifdef PCRE2_DEBUG
3155
uint32_t *parsed_pattern_check;
3156
ptrdiff_t parsed_pattern_extra = 0;
3157
ptrdiff_t parsed_pattern_extra_check = 0;
3158
PCRE2_SPTR ptr_check;
3159
#endif
3160
3161
82.1k
PCRE2_ASSERT(parsed_pattern != NULL);
3162
3163
/* Insert leading items for word and line matching (features provided for the
3164
benefit of pcre2grep). */
3165
3166
82.1k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
3167
0
  {
3168
0
  *parsed_pattern++ = META_CIRCUMFLEX;
3169
0
  *parsed_pattern++ = META_NOCAPTURE;
3170
0
  }
3171
82.1k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
3172
0
  {
3173
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
3174
0
  *parsed_pattern++ = META_NOCAPTURE;
3175
0
  }
3176
3177
#ifdef PCRE2_DEBUG
3178
parsed_pattern_check = parsed_pattern;
3179
ptr_check = ptr;
3180
#endif
3181
3182
/* If the pattern is actually a literal string, process it separately to avoid
3183
cluttering up the main loop. */
3184
3185
82.1k
if ((options & PCRE2_LITERAL) != 0)
3186
0
  {
3187
0
  while (ptr < ptrend)
3188
0
    {
3189
    /* LCOV_EXCL_START */
3190
0
    if (parsed_pattern >= parsed_pattern_end)
3191
0
      {
3192
0
      PCRE2_DEBUG_UNREACHABLE();
3193
0
      errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3194
0
      goto FAILED;
3195
0
      }
3196
    /* LCOV_EXCL_STOP */
3197
3198
0
    thisptr = ptr;
3199
0
    GETCHARINCTEST(c, ptr);
3200
0
    if (auto_callout)
3201
0
      parsed_pattern = manage_callouts(thisptr, &previous_callout,
3202
0
        auto_callout, parsed_pattern, cb);
3203
0
    PARSED_LITERAL(c, parsed_pattern);
3204
0
    }
3205
0
  goto PARSED_END;
3206
0
  }
3207
3208
/* Process a real regex which may contain meta-characters. */
3209
3210
82.1k
top_nest = NULL;
3211
82.1k
end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
3212
3213
/* The size of the nest_save structure might not be a factor of the size of the
3214
workspace. Therefore we must round down end_nests so as to correctly avoid
3215
creating a nest_save that spans the end of the workspace. */
3216
3217
82.1k
end_nests = (nest_save *)((char *)end_nests -
3218
82.1k
  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
3219
3220
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
3221
3222
82.1k
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
3223
3224
/* Now scan the pattern */
3225
3226
7.73M
while (ptr < ptrend)
3227
7.65M
  {
3228
7.65M
  int prev_expect_cond_assert;
3229
7.65M
  uint32_t min_repeat = 0, max_repeat = 0;
3230
7.65M
  uint32_t set, unset, *optset;
3231
7.65M
  uint32_t xset, xunset, *xoptset;
3232
7.65M
  uint32_t terminator;
3233
7.65M
  uint32_t prev_meta_quantifier;
3234
7.65M
  BOOL prev_okquantifier;
3235
7.65M
  PCRE2_SPTR tempptr;
3236
7.65M
  PCRE2_SIZE offset;
3237
3238
7.65M
  if (nest_depth > cb->cx->parens_nest_limit)
3239
3
    {
3240
3
    errorcode = ERR19;
3241
3
    goto FAILED;        /* Parentheses too deeply nested */
3242
3
    }
3243
3244
  /* Check that we haven't emitted too much into parsed_pattern. We allocate
3245
  a suitably-sized buffer upfront, then do unchecked writes to it. If we only
3246
  write a little bit too much, everything will appear to be OK, because the
3247
  upfront size is an overestimate... but a malicious pattern could end up
3248
  forcing a write past the buffer end. We must catch this during
3249
  development. */
3250
3251
#ifdef PCRE2_DEBUG
3252
  /* Strong post-write check. Won't help in release builds - at this point
3253
  the write has already occurred so it's too late. However, should stop us
3254
  committing unsafe code. */
3255
  PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
3256
               (parsed_pattern_extra - parsed_pattern_extra_check) <=
3257
                 max_parsed_pattern(ptr_check, ptr, utf, options));
3258
  parsed_pattern_check = parsed_pattern;
3259
  parsed_pattern_extra_check = parsed_pattern_extra;
3260
  ptr_check = ptr;
3261
#endif
3262
3263
  /* LCOV_EXCL_START */
3264
7.65M
  if (parsed_pattern >= parsed_pattern_end)
3265
0
    {
3266
    /* Weak pre-write check; only ensures parsed_pattern[0] is writeable
3267
    (but the code below can write many chars). Better than nothing. */
3268
0
    PCRE2_DEBUG_UNREACHABLE();
3269
0
    errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3270
0
    goto FAILED;
3271
0
    }
3272
  /* LCOV_EXCL_STOP */
3273
3274
  /* If the last time round this loop something was added, parsed_pattern will
3275
  no longer be equal to this_parsed_item. Remember where the previous item
3276
  started and reset for the next item. Note that sometimes round the loop,
3277
  nothing gets added (e.g. for ignored white space). */
3278
3279
7.65M
  if (this_parsed_item != parsed_pattern)
3280
7.54M
    {
3281
7.54M
    prev_parsed_item = this_parsed_item;
3282
7.54M
    this_parsed_item = parsed_pattern;
3283
7.54M
    }
3284
3285
  /* Get next input character, save its position for callout handling. */
3286
3287
7.65M
  thisptr = ptr;
3288
7.65M
  GETCHARINCTEST(c, ptr);
3289
3290
  /* Copy quoted literals until \E, allowing for the possibility of automatic
3291
  callouts, except when processing a (*VERB) "name".  */
3292
3293
7.65M
  if (inescq)
3294
43.6k
    {
3295
43.6k
    if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3296
1.38k
      {
3297
1.38k
      inescq = FALSE;
3298
1.38k
      ptr++;   /* Skip E */
3299
1.38k
      }
3300
42.2k
    else
3301
42.2k
      {
3302
42.2k
      if (inverbname)
3303
930
        {                          /* Don't use PARSED_LITERAL() because it */
3304
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3305
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3306
#endif
3307
930
        *parsed_pattern++ = c;
3308
930
        }
3309
41.3k
      else
3310
41.3k
        {
3311
41.3k
        if (after_manual_callout-- <= 0)
3312
40.7k
          parsed_pattern = manage_callouts(thisptr, &previous_callout,
3313
40.7k
            auto_callout, parsed_pattern, cb);
3314
41.3k
        PARSED_LITERAL(c, parsed_pattern);
3315
41.3k
        }
3316
42.2k
      meta_quantifier = 0;
3317
42.2k
      }
3318
43.6k
    continue;  /* Next character */
3319
43.6k
    }
3320
3321
  /* If we are processing the "name" part of a (*VERB:NAME) item, all
3322
  characters up to the closing parenthesis are literals except when
3323
  PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
3324
  and \E and escaped characters are allowed (no character types such as \d). If
3325
  PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
3326
  this by not entering the special (*VERB:NAME) processing - they are then
3327
  picked up below. Note that c is a character, not a code unit, so we must not
3328
  use MAX_255 to test its size because MAX_255 tests code units and is assumed
3329
  TRUE in 8-bit mode. */
3330
3331
7.61M
  if (inverbname &&
3332
177k
       (
3333
        /* EITHER: not both options set */
3334
177k
        ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
3335
177k
                    (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
3336
12.2k
#ifdef SUPPORT_UNICODE
3337
        /* OR: character > 255 AND not Unicode Pattern White Space */
3338
12.2k
        (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
3339
5.71k
#endif
3340
        /* OR: not a # comment or isspace() white space */
3341
5.71k
        (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
3342
4.62k
#ifdef SUPPORT_UNICODE
3343
        /* and not CHAR_NEL when Unicode is supported */
3344
4.62k
          && c != CHAR_NEL
3345
5.71k
#endif
3346
5.71k
       )))
3347
176k
    {
3348
176k
    PCRE2_SIZE verbnamelength;
3349
3350
176k
    switch(c)
3351
176k
      {
3352
164k
      default:                     /* Don't use PARSED_LITERAL() because it */
3353
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3354
      if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3355
#endif
3356
164k
      *parsed_pattern++ = c;
3357
164k
      break;
3358
3359
6.83k
      case CHAR_RIGHT_PARENTHESIS:
3360
6.83k
      inverbname = FALSE;
3361
      /* This is the length in characters */
3362
6.83k
      verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
3363
      /* But the limit on the length is in code units */
3364
6.83k
      if (ptr - verbnamestart - 1 > (int)MAX_MARK)
3365
0
        {
3366
0
        ptr--;
3367
0
        errorcode = ERR76;
3368
0
        goto FAILED;
3369
0
        }
3370
6.83k
      *verblengthptr = (uint32_t)verbnamelength;
3371
3372
      /* If this name was on a verb such as (*ACCEPT) which does not continue,
3373
      a (*MARK) was generated for the name. We now add the original verb as the
3374
      next item. */
3375
3376
6.83k
      if (add_after_mark != 0)
3377
635
        {
3378
635
        *parsed_pattern++ = add_after_mark;
3379
635
        add_after_mark = 0;
3380
635
        }
3381
6.83k
      break;
3382
3383
5.49k
      case CHAR_BACKSLASH:
3384
5.49k
      if ((options & PCRE2_ALT_VERBNAMES) != 0)
3385
803
        {
3386
803
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3387
803
          xoptions, cb->bracount, FALSE, cb);
3388
803
        if (errorcode != 0) goto FAILED;
3389
803
        }
3390
4.68k
      else escape = 0;   /* Treat all as literal */
3391
3392
5.48k
      switch(escape)
3393
5.48k
        {
3394
5.05k
        case 0:                    /* Don't use PARSED_LITERAL() because it */
3395
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3396
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3397
#endif
3398
5.05k
        *parsed_pattern++ = c;
3399
5.05k
        break;
3400
3401
0
        case ESC_ub:
3402
0
        *parsed_pattern++ = CHAR_u;
3403
0
        PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3404
0
        break;
3405
3406
204
        case ESC_Q:
3407
204
        inescq = TRUE;
3408
204
        break;
3409
3410
202
        case ESC_E:           /* Ignore */
3411
202
        break;
3412
3413
26
        default:
3414
26
        errorcode = ERR40;    /* Invalid in verb name */
3415
26
        goto FAILED;
3416
5.48k
        }
3417
176k
      }
3418
176k
    continue;   /* Next character in pattern */
3419
176k
    }
3420
3421
  /* Not a verb name character. At this point we must process everything that
3422
  must not change the quantification state. This is mainly comments, but we
3423
  handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3424
  A+, as in Perl. An isolated \E is ignored. */
3425
3426
7.43M
  if (c == CHAR_BACKSLASH && ptr < ptrend)
3427
222k
    {
3428
222k
    if (*ptr == CHAR_Q || *ptr == CHAR_E)
3429
1.65k
      {
3430
      /* A literal inside a \Q...\E is not allowed if we are expecting a
3431
      conditional assertion, but an empty \Q\E sequence is OK. */
3432
1.65k
      if (expect_cond_assert > 0 && *ptr == CHAR_Q &&
3433
52
          !(ptrend - ptr >= 3 && ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E))
3434
17
        {
3435
17
        ptr--;
3436
17
        errorcode = ERR28;
3437
17
        goto FAILED;
3438
17
        }
3439
1.63k
      inescq = *ptr == CHAR_Q;
3440
1.63k
      ptr++;
3441
1.63k
      continue;
3442
1.65k
      }
3443
222k
    }
3444
3445
  /* Skip over whitespace and # comments in extended mode. Note that c is a
3446
  character, not a code unit, so we must not use MAX_255 to test its size
3447
  because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3448
  whitespace characters are those designated as "Pattern White Space" by
3449
  Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3450
  U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3451
  subset of space characters that match \h and \v. */
3452
3453
7.43M
  if ((options & PCRE2_EXTENDED) != 0)
3454
2.11M
    {
3455
2.11M
    if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3456
2.09M
#ifdef SUPPORT_UNICODE
3457
2.09M
    if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3458
2.09M
#endif
3459
2.09M
    if (c == CHAR_NUMBER_SIGN)
3460
1.11k
      {
3461
41.8k
      while (ptr < ptrend)
3462
41.3k
        {
3463
41.3k
        if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3464
703
          {                       /* IS_NEWLINE sets cb->nllen. */
3465
703
          ptr += cb->nllen;
3466
703
          break;
3467
703
          }
3468
40.6k
        ptr++;
3469
40.6k
#ifdef SUPPORT_UNICODE
3470
40.6k
        if (utf) FORWARDCHARTEST(ptr, ptrend);
3471
40.6k
#endif
3472
40.6k
        }
3473
1.11k
      continue;  /* Next character in pattern */
3474
1.11k
      }
3475
2.09M
    }
3476
3477
  /* Skip over bracketed comments */
3478
3479
7.42M
  if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3480
278k
      ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3481
218
    {
3482
872
    while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3483
218
    if (ptr >= ptrend)
3484
14
      {
3485
14
      errorcode = ERR18;  /* A special error for missing ) in a comment */
3486
14
      goto FAILED;        /* to make it easier to debug. */
3487
14
      }
3488
204
    ptr++;
3489
204
    continue;  /* Next character in pattern */
3490
218
    }
3491
3492
  /* If the next item is not a quantifier, fill in length of any previous
3493
  callout and create an auto callout if required. */
3494
3495
7.42M
  if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3496
7.01M
       (c != CHAR_LEFT_CURLY_BRACKET ||
3497
80.7k
         (tempptr = ptr,
3498
80.7k
         !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3499
6.96M
    {
3500
6.96M
    if (after_manual_callout-- <= 0)
3501
6.95M
      {
3502
6.95M
      parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3503
6.95M
        parsed_pattern, cb);
3504
6.95M
      this_parsed_item = parsed_pattern;  /* New start for current item */
3505
6.95M
      }
3506
6.96M
    }
3507
3508
  /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3509
  assertion, possibly preceded by a callout. If the value is 1, we have just
3510
  had the callout and expect an assertion. There must be at least 3 more
3511
  characters in all cases. When expect_cond_assert is 2, we know that the
3512
  current character is an opening parenthesis, as otherwise we wouldn't be
3513
  here. However, when it is 1, we need to check, and it's easiest just to check
3514
  always. Note that expect_cond_assert may be negative, since all callouts just
3515
  decrement it. */
3516
3517
7.42M
  if (expect_cond_assert > 0)
3518
7.96k
    {
3519
7.96k
    BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3520
7.93k
              (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3521
7.96k
    if (ok)
3522
7.93k
      {
3523
7.93k
      if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3524
317
        {
3525
317
        ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3526
317
        }
3527
7.61k
      else switch(ptr[1])  /* Traditional symbolic format */
3528
7.61k
        {
3529
130
        case CHAR_C:
3530
130
        ok = expect_cond_assert == 2;
3531
130
        break;
3532
3533
2.91k
        case CHAR_EQUALS_SIGN:
3534
6.30k
        case CHAR_EXCLAMATION_MARK:
3535
6.30k
        break;
3536
3537
1.18k
        case CHAR_LESS_THAN_SIGN:
3538
1.18k
        ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3539
1.18k
        break;
3540
3541
5
        default:
3542
5
        ok = FALSE;
3543
7.61k
        }
3544
7.93k
      }
3545
3546
7.96k
    if (!ok)
3547
48
      {
3548
48
      errorcode = ERR28;
3549
48
      if (expect_cond_assert == 2) goto FAILED;
3550
19
      goto FAILED_BACK;
3551
48
      }
3552
7.96k
    }
3553
3554
  /* Remember whether we are expecting a conditional assertion, and set the
3555
  default for this item. */
3556
3557
7.42M
  prev_expect_cond_assert = expect_cond_assert;
3558
7.42M
  expect_cond_assert = 0;
3559
3560
  /* Remember quantification status for the previous significant item, then set
3561
  default for this item. */
3562
3563
7.42M
  prev_okquantifier = okquantifier;
3564
7.42M
  prev_meta_quantifier = meta_quantifier;
3565
7.42M
  okquantifier = FALSE;
3566
7.42M
  meta_quantifier = 0;
3567
3568
  /* If the previous significant item was a quantifier, adjust the parsed code
3569
  if there is a following modifier. The base meta value is always followed by
3570
  the PLUS and QUERY values, in that order. We do this here rather than after
3571
  reading a quantifier so that intervening comments and /x whitespace can be
3572
  ignored without having to replicate code. */
3573
3574
7.42M
  if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3575
90.5k
    {
3576
90.5k
    parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3577
90.5k
      prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3578
83.1k
        0x00020000u : 0x00010000u);
3579
90.5k
    continue;  /* Next character in pattern */
3580
90.5k
    }
3581
3582
  /* Process the next item in the main part of a pattern. */
3583
3584
7.32M
  switch(c)
3585
7.32M
    {
3586
5.74M
    default:              /* Non-special character */
3587
5.74M
    PARSED_LITERAL(c, parsed_pattern);
3588
5.74M
    break;
3589
3590
3591
    /* ---- Escape sequence ---- */
3592
3593
220k
    case CHAR_BACKSLASH:
3594
220k
    tempptr = ptr;
3595
220k
    escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3596
220k
      xoptions, cb->bracount, FALSE, cb);
3597
220k
    if (errorcode != 0)
3598
846
      {
3599
1.24k
      ESCAPE_FAILED:
3600
1.24k
      if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3601
1.24k
        goto FAILED;
3602
0
      ptr = tempptr;
3603
0
      if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3604
0
        {
3605
0
        GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3606
0
        }
3607
0
      escape = 0;                 /* Treat as literal character */
3608
0
      }
3609
3610
    /* The escape was a data escape or literal character. */
3611
3612
219k
    if (escape == 0)
3613
76.4k
      {
3614
76.4k
      PARSED_LITERAL(c, parsed_pattern);
3615
76.4k
      }
3616
3617
    /* The escape was a back (or forward) reference. We keep the offset in
3618
    order to give a more useful diagnostic for a bad forward reference. For
3619
    references to groups numbered less than 10 we can't use more than two items
3620
    in parsed_pattern because they may be just two characters in the input (and
3621
    in a 64-bit world an offset may need two elements). So for them, the offset
3622
    of the first occurrence is held in a special vector. */
3623
3624
143k
    else if (escape < 0)
3625
12.0k
      {
3626
12.0k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
3627
12.0k
      escape = -escape - 1;
3628
12.0k
      *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3629
12.0k
      if (escape < 10)
3630
10.8k
        {
3631
10.8k
        if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3632
5.66k
          cb->small_ref_offset[escape] = offset;
3633
10.8k
        }
3634
1.14k
      else
3635
1.14k
        {
3636
1.14k
        PUTOFFSET(offset, parsed_pattern);
3637
1.14k
        }
3638
12.0k
      okquantifier = TRUE;
3639
12.0k
      }
3640
3641
    /* The escape was a character class such as \d etc. or other special
3642
    escape indicator such as \A or \X. Most of them generate just a single
3643
    parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3644
    value. They are supported only when Unicode is available. The type and
3645
    value are packed into a single 32-bit value so that the whole sequence
3646
    uses only two elements in the parsed_vector. This is because the same
3647
    coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3648
    set.
3649
3650
    There are also some cases where the escape sequence is followed by a name:
3651
    \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3652
    and \g'name' are subroutine calls by name; \g{name} is a synonym for
3653
    \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3654
    and returned as a negative value (handled above). A name is coded as an
3655
    offset into the pattern and a length. */
3656
3657
131k
    else switch (escape)
3658
131k
      {
3659
3
      case ESC_C:
3660
3
#ifdef NEVER_BACKSLASH_C
3661
3
      errorcode = ERR85;
3662
3
      goto ESCAPE_FAILED;
3663
#else
3664
      if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3665
        {
3666
        errorcode = ERR83;
3667
        goto ESCAPE_FAILED;
3668
        }
3669
#endif
3670
0
      okquantifier = TRUE;
3671
0
      *parsed_pattern++ = META_ESCAPE + escape;
3672
0
      break;
3673
3674
      /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3675
      when \u{ is not followed by hex digits and }. It requests two literal
3676
      characters, u and { and we need this, as otherwise \u{ 12} (for example)
3677
      would be treated as u{12} now that spaces are allowed in quantifiers. */
3678
3679
0
      case ESC_ub:
3680
0
      *parsed_pattern++ = CHAR_u;
3681
0
      PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3682
0
      break;
3683
3684
5.57k
      case ESC_X:
3685
#ifndef SUPPORT_UNICODE
3686
      errorcode = ERR45;   /* Supported only with Unicode support */
3687
      goto ESCAPE_FAILED;
3688
#endif
3689
17.0k
      case ESC_H:
3690
26.8k
      case ESC_h:
3691
28.6k
      case ESC_N:
3692
34.3k
      case ESC_R:
3693
37.7k
      case ESC_V:
3694
41.1k
      case ESC_v:
3695
41.1k
      okquantifier = TRUE;
3696
41.1k
      *parsed_pattern++ = META_ESCAPE + escape;
3697
41.1k
      break;
3698
3699
19.4k
      default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3700
19.4k
      *parsed_pattern++ = META_ESCAPE + escape;
3701
19.4k
      break;
3702
3703
      /* Escapes that may change in UCP mode. */
3704
3705
7.89k
      case ESC_d:
3706
20.0k
      case ESC_D:
3707
27.1k
      case ESC_s:
3708
41.0k
      case ESC_S:
3709
50.6k
      case ESC_w:
3710
59.4k
      case ESC_W:
3711
59.4k
      okquantifier = TRUE;
3712
59.4k
      parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3713
59.4k
        xoptions);
3714
59.4k
      break;
3715
3716
      /* Unicode property matching */
3717
3718
3.99k
      case ESC_P:
3719
6.68k
      case ESC_p:
3720
6.68k
#ifdef SUPPORT_UNICODE
3721
6.68k
        {
3722
6.68k
        BOOL negated;
3723
6.68k
        uint16_t ptype = 0, pdata = 0;
3724
6.68k
        if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))
3725
242
          goto ESCAPE_FAILED;
3726
6.44k
        if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3727
6.44k
        *parsed_pattern++ = META_ESCAPE + escape;
3728
6.44k
        *parsed_pattern++ = (ptype << 16) | pdata;
3729
6.44k
        okquantifier = TRUE;
3730
6.44k
        }
3731
#else
3732
      errorcode = ERR45;
3733
      goto ESCAPE_FAILED;
3734
#endif
3735
0
      break;  /* End \P and \p */
3736
3737
      /* When \g is used with quotes or angle brackets as delimiters, it is a
3738
      numerical or named subroutine call, and control comes here. When used
3739
      with brace delimiters it is a numerical back reference and does not come
3740
      here because check_escape() returns it directly as a reference. \k is
3741
      always a named back reference. */
3742
3743
671
      case ESC_g:
3744
4.63k
      case ESC_k:
3745
4.63k
      if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3746
3.65k
          *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3747
16
        {
3748
16
        errorcode = (escape == ESC_g)? ERR57 : ERR69;
3749
16
        goto ESCAPE_FAILED;
3750
16
        }
3751
4.61k
      terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3752
2.65k
        CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3753
987
        CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3754
3755
      /* For a non-braced \g, check for a numerical recursion. */
3756
3757
4.61k
      if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3758
671
        {
3759
671
        PCRE2_SPTR p = ptr + 1;
3760
3761
671
        if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3762
671
            &errorcode))
3763
237
          {
3764
237
          if (p >= ptrend || *p != terminator)
3765
9
            {
3766
9
            ptr = p;
3767
9
            errorcode = ERR119;  /* Missing terminator for number */
3768
9
            goto ESCAPE_FAILED;
3769
9
            }
3770
228
          ptr = p + 1;
3771
228
          goto SET_RECURSION;
3772
237
          }
3773
434
        if (errorcode != 0) goto ESCAPE_FAILED;
3774
434
        }
3775
3776
      /* Not a numerical recursion. Perl allows spaces and tabs after { and
3777
      before } but not for other delimiters. */
3778
3779
4.37k
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3780
4.37k
          &errorcode, cb)) goto ESCAPE_FAILED;
3781
3782
      /* \k and \g when used with braces are back references, whereas \g used
3783
      with quotes or angle brackets is a recursion */
3784
3785
4.25k
      *parsed_pattern++ =
3786
4.25k
        (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3787
3.84k
          META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3788
4.25k
      *parsed_pattern++ = namelen;
3789
3790
4.25k
      PUTOFFSET(offset, parsed_pattern);
3791
4.25k
      okquantifier = TRUE;
3792
4.25k
      break;  /* End special escape processing */
3793
131k
      }
3794
219k
    break;    /* End escape sequence processing */
3795
3796
3797
    /* ---- Single-character special items ---- */
3798
3799
249k
    case CHAR_CIRCUMFLEX_ACCENT:
3800
249k
    *parsed_pattern++ = META_CIRCUMFLEX;
3801
249k
    break;
3802
3803
9.04k
    case CHAR_DOLLAR_SIGN:
3804
9.04k
    *parsed_pattern++ = META_DOLLAR;
3805
9.04k
    break;
3806
3807
20.8k
    case CHAR_DOT:
3808
20.8k
    *parsed_pattern++ = META_DOT;
3809
20.8k
    okquantifier = TRUE;
3810
20.8k
    break;
3811
3812
3813
    /* ---- Single-character quantifiers ---- */
3814
3815
79.0k
    case CHAR_ASTERISK:
3816
79.0k
    meta_quantifier = META_ASTERISK;
3817
79.0k
    goto CHECK_QUANTIFIER;
3818
3819
164k
    case CHAR_PLUS:
3820
164k
    meta_quantifier = META_PLUS;
3821
164k
    goto CHECK_QUANTIFIER;
3822
3823
72.4k
    case CHAR_QUESTION_MARK:
3824
72.4k
    meta_quantifier = META_QUERY;
3825
72.4k
    goto CHECK_QUANTIFIER;
3826
3827
3828
    /* ---- Potential {n,m} quantifier ---- */
3829
3830
80.7k
    case CHAR_LEFT_CURLY_BRACKET:
3831
80.7k
    if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3832
80.7k
        &errorcode))
3833
30.1k
      {
3834
30.1k
      if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3835
30.0k
      PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3836
30.0k
      break;                               /* No more quantifier processing */
3837
30.1k
      }
3838
50.6k
    meta_quantifier = META_MINMAX;
3839
    /* Fall through */
3840
3841
3842
    /* ---- Quantifier post-processing ---- */
3843
3844
    /* Check that a quantifier is allowed after the previous item. This
3845
    guarantees that there is a previous item. */
3846
3847
366k
    CHECK_QUANTIFIER:
3848
366k
    if (!prev_okquantifier)
3849
169
      {
3850
169
      errorcode = ERR9;
3851
169
      goto FAILED;
3852
169
      }
3853
3854
    /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3855
    quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3856
    sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3857
    wrapping it in non-capturing brackets, but we have to allow for a preceding
3858
    (*MARK) for when (*ACCEPT) has an argument. */
3859
3860
366k
    if (*prev_parsed_item == META_ACCEPT)
3861
757
      {
3862
757
      uint32_t *p;
3863
2.22k
      for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3864
757
      *verbstartptr = META_NOCAPTURE;
3865
757
      parsed_pattern[1] = META_KET;
3866
757
      parsed_pattern += 2;
3867
3868
#ifdef PCRE2_DEBUG
3869
      PCRE2_ASSERT(parsed_pattern_extra >= 2);
3870
      parsed_pattern_extra -= 2;
3871
#endif
3872
757
      }
3873
3874
    /* Now we can put the quantifier into the parsed pattern vector. At this
3875
    stage, we have only the basic quantifier. The check for a following + or ?
3876
    modifier happens at the top of the loop, after any intervening comments
3877
    have been removed. */
3878
3879
366k
    *parsed_pattern++ = meta_quantifier;
3880
366k
    if (c == CHAR_LEFT_CURLY_BRACKET)
3881
50.5k
      {
3882
50.5k
      *parsed_pattern++ = min_repeat;
3883
50.5k
      *parsed_pattern++ = max_repeat;
3884
50.5k
      }
3885
366k
    break;
3886
3887
3888
    /* ---- Character class ---- */
3889
3890
89.2k
    case CHAR_LEFT_SQUARE_BRACKET:
3891
3892
    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3893
    used for "start of word" and "end of word". As these are otherwise illegal
3894
    sequences, we don't break anything by recognizing them. They are replaced
3895
    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3896
    erroneous and are handled by the normal code below. */
3897
3898
89.2k
    if (ptrend - ptr >= 6 &&
3899
85.2k
         (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3900
85.2k
          PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3901
0
      {
3902
0
      *parsed_pattern++ = META_ESCAPE + ESC_b;
3903
3904
0
      if (ptr[2] == CHAR_LESS_THAN_SIGN)
3905
0
        {
3906
0
        *parsed_pattern++ = META_LOOKAHEAD;
3907
0
        }
3908
0
      else
3909
0
        {
3910
0
        *parsed_pattern++ = META_LOOKBEHIND;
3911
0
        *has_lookbehind = TRUE;
3912
3913
        /* The offset is used only for the "non-fixed length" error; this won't
3914
        occur here, so just store zero. */
3915
3916
0
        PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3917
0
        }
3918
3919
0
      if ((options & PCRE2_UCP) == 0)
3920
0
        *parsed_pattern++ = META_ESCAPE + ESC_w;
3921
0
      else
3922
0
        {
3923
0
        *parsed_pattern++ = META_ESCAPE + ESC_p;
3924
0
        *parsed_pattern++ = PT_WORD << 16;
3925
0
        }
3926
0
      *parsed_pattern++ = META_KET;
3927
0
      ptr += 6;
3928
0
      okquantifier = TRUE;
3929
0
      break;
3930
0
      }
3931
3932
    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3933
    they are encountered at the top level, so we'll do that too. */
3934
3935
89.2k
    if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3936
86.0k
         *ptr == CHAR_EQUALS_SIGN) &&
3937
4.23k
        check_posix_syntax(ptr, ptrend, &tempptr))
3938
48
      {
3939
48
      errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3940
48
      ptr = tempptr + 2;
3941
48
      goto FAILED;
3942
48
      }
3943
3944
89.1k
    class_mode_state = ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)?
3945
77.5k
        CLASS_MODE_ALT_EXT : CLASS_MODE_NORMAL;
3946
3947
    /* Jump here from '(?[...])'. That jump must initialize class_mode_state,
3948
    set c to the '[' character, and ptr to just after the '['. */
3949
3950
90.1k
    FROM_PERL_EXTENDED_CLASS:
3951
90.1k
    okquantifier = TRUE;
3952
3953
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3954
    because there are holes in the encoding, and simply using the range A-Z
3955
    (for example) would include the characters in the holes. This applies only
3956
    to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3957
    in this respect. In order to accommodate this, we keep track of whether
3958
    character values are literal or not, and a state variable for handling
3959
    ranges. */
3960
3961
    /* Loop for the contents of the class. Classes may be nested, if
3962
    PCRE2_ALT_EXTENDED_CLASS is set, or the class is of the form (?[...]). */
3963
3964
    /* c is still set to '[' so the loop will handle the start of the class. */
3965
3966
90.1k
    class_depth_m1 = -1;
3967
90.1k
    class_maxdepth_m1 = -1;
3968
90.1k
    class_range_state = RANGE_NO;
3969
90.1k
    class_op_state = CLASS_OP_EMPTY;
3970
90.1k
    class_start = NULL;
3971
3972
90.1k
    for (;;)
3973
2.37M
      {
3974
2.37M
      BOOL char_is_literal = TRUE;
3975
3976
      /* Inside \Q...\E everything is literal except \E */
3977
3978
2.37M
      if (inescq)
3979
1.48k
        {
3980
1.48k
        if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3981
206
          {
3982
206
          inescq = FALSE;                   /* Reset literal state */
3983
206
          ptr++;                            /* Skip the 'E' */
3984
206
          goto CLASS_CONTINUE;
3985
206
          }
3986
3987
        /* Surprisingly, you cannot use \Q..\E to escape a character inside a
3988
        Perl extended class. However, empty \Q\E sequences are allowed, so here
3989
        we're only giving an error if the \Q..\E is non-empty. */
3990
3991
1.27k
        if (class_mode_state == CLASS_MODE_PERL_EXT)
3992
3
          {
3993
3
          errorcode = ERR116;
3994
3
          goto FAILED;
3995
3
          }
3996
3997
1.27k
        goto CLASS_LITERAL;
3998
1.27k
        }
3999
4000
      /* Skip over space and tab (only) in extended-more mode, or anywhere
4001
      inside a Perl extended class (which implies /xx). */
4002
4003
2.37M
      if ((c == CHAR_SPACE || c == CHAR_HT) &&
4004
6.68k
          ((options & PCRE2_EXTENDED_MORE) != 0 ||
4005
6.14k
           class_mode_state >= CLASS_MODE_PERL_EXT))
4006
1.08k
        goto CLASS_CONTINUE;
4007
4008
      /* Handle POSIX class names. Perl allows a negation extension of the
4009
      form [:^name:]. A square bracket that doesn't match the syntax is
4010
      treated as a literal. We also recognize the POSIX constructions
4011
      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4012
      5.6 and 5.8 do. */
4013
4014
2.37M
      if (class_depth_m1 >= 0 &&
4015
2.28M
          c == CHAR_LEFT_SQUARE_BRACKET &&
4016
53.1k
          ptrend - ptr >= 3 &&
4017
52.8k
          (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
4018
33.9k
           *ptr == CHAR_EQUALS_SIGN) &&
4019
19.5k
          check_posix_syntax(ptr, ptrend, &tempptr))
4020
14.9k
        {
4021
14.9k
        BOOL posix_negate = FALSE;
4022
14.9k
        int posix_class;
4023
4024
        /* Perl treats a hyphen before a POSIX class as a literal, not the
4025
        start of a range. However, it gives a warning in its warning mode. PCRE
4026
        does not have a warning mode, so we give an error, because this is
4027
        likely an error on the user's part. */
4028
4029
14.9k
        if (class_range_state == RANGE_STARTED)
4030
3
          {
4031
3
          ptr = tempptr + 2;
4032
3
          errorcode = ERR50;
4033
3
          goto FAILED;
4034
3
          }
4035
4036
        /* Perl treats a hyphen after a POSIX class as a literal, not the
4037
        start of a range. However, it gives a warning in its warning mode
4038
        unless the hyphen is the last character in the class. PCRE does not
4039
        have a warning mode, so we give an error, because this is likely an
4040
        error on the user's part.
4041
4042
        Roll back to the hyphen for the error position. */
4043
4044
14.8k
        if (class_range_state == RANGE_FORBID_STARTED)
4045
3
          {
4046
3
          ptr = class_range_forbid_ptr;
4047
3
          errorcode = ERR50;
4048
3
          goto FAILED;
4049
3
          }
4050
4051
        /* Disallow implicit union in Perl extended classes. */
4052
4053
14.8k
        if (class_op_state == CLASS_OP_OPERAND &&
4054
9.15k
            class_mode_state == CLASS_MODE_PERL_EXT)
4055
3
          {
4056
3
          ptr = tempptr + 2;
4057
3
          errorcode = ERR113;
4058
3
          goto FAILED;
4059
3
          }
4060
4061
14.8k
        if (*ptr != CHAR_COLON)
4062
3
          {
4063
3
          ptr = tempptr + 2;
4064
3
          errorcode = ERR13;
4065
3
          goto FAILED;
4066
3
          }
4067
4068
14.8k
        if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
4069
1.43k
          {
4070
1.43k
          posix_negate = TRUE;
4071
1.43k
          ptr++;
4072
1.43k
          }
4073
4074
14.8k
        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4075
14.8k
        ptr = tempptr + 2;
4076
14.8k
        if (posix_class < 0)
4077
46
          {
4078
46
          errorcode = ERR30;
4079
46
          goto FAILED;
4080
46
          }
4081
4082
        /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
4083
        case, the hyphen is treated as a literal, but for '-1' it is disallowed
4084
        (because it would be interpreted as range). */
4085
4086
14.8k
        class_range_state = RANGE_FORBID_NO;
4087
14.8k
        class_op_state = CLASS_OP_OPERAND;
4088
4089
        /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
4090
        of the POSIX classes are converted to use Unicode properties \p or \P
4091
        or, in one case, \h or \H. The substitutes table has two values per
4092
        class, containing the type and value of a \p or \P item. The special
4093
        cases are specified with a negative type: a non-zero value causes \h or
4094
        \H to be used, and a zero value falls through to behave like a non-UCP
4095
        POSIX class. There are now also some extra options that force ASCII for
4096
        some classes. */
4097
4098
14.8k
#ifdef SUPPORT_UNICODE
4099
14.8k
        if ((options & PCRE2_UCP) != 0 &&
4100
6.93k
            (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
4101
6.64k
            !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
4102
673
              (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
4103
6.24k
          {
4104
6.24k
          int ptype = posix_substitutes[2*posix_class];
4105
6.24k
          int pvalue = posix_substitutes[2*posix_class + 1];
4106
4107
6.24k
          if (ptype >= 0)
4108
5.36k
            {
4109
5.36k
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
4110
5.36k
            *parsed_pattern++ = (ptype << 16) | pvalue;
4111
5.36k
            goto CLASS_CONTINUE;
4112
5.36k
            }
4113
4114
886
          if (pvalue != 0)
4115
207
            {
4116
207
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
4117
207
            goto CLASS_CONTINUE;
4118
207
            }
4119
4120
          /* Fall through */
4121
886
          }
4122
9.27k
#endif  /* SUPPORT_UNICODE */
4123
4124
        /* Non-UCP POSIX class */
4125
4126
9.27k
        *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
4127
9.27k
        *parsed_pattern++ = posix_class;
4128
9.27k
        }
4129
4130
      /* Check for the start of the outermost class, or the start of a nested class. */
4131
4132
2.35M
      else if ((c == CHAR_LEFT_SQUARE_BRACKET &&
4133
128k
                (class_depth_m1 < 0 || class_mode_state == CLASS_MODE_ALT_EXT ||
4134
25.5k
                 class_mode_state == CLASS_MODE_PERL_EXT)) ||
4135
2.25M
               (c == CHAR_LEFT_PARENTHESIS &&
4136
13.2k
                class_mode_state == CLASS_MODE_PERL_EXT))
4137
104k
        {
4138
104k
        uint32_t start_c = c;
4139
104k
        uint32_t new_class_mode_state;
4140
4141
        /* Update the class mode, if moving into a 'leaf' inside a Perl extended
4142
        class. */
4143
4144
104k
        if (start_c == CHAR_LEFT_SQUARE_BRACKET &&
4145
104k
            class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 >= 0)
4146
1.56k
          new_class_mode_state = CLASS_MODE_PERL_EXT_LEAF;
4147
103k
        else
4148
103k
          new_class_mode_state = class_mode_state;
4149
4150
        /* Tidy up the other class before starting the nested class. */
4151
        /* -[ beginning a nested class is a literal '-' */
4152
4153
104k
        if (class_range_state == RANGE_STARTED)
4154
201
          parsed_pattern[-1] = CHAR_MINUS;
4155
4156
        /* Disallow implicit union in Perl extended classes. */
4157
4158
104k
        if (class_op_state == CLASS_OP_OPERAND &&
4159
10.7k
            class_mode_state == CLASS_MODE_PERL_EXT)
4160
4
          {
4161
4
          errorcode = ERR113;
4162
4
          goto FAILED;
4163
4
          }
4164
4165
        /* Validate nesting depth */
4166
104k
        if (class_depth_m1 >= ECLASS_NEST_LIMIT - 1)
4167
7
          {
4168
7
          ptr--;  /* Point rightwards at the paren, same as ERR19. */
4169
7
          errorcode = ERR107;  /* Classes too deeply nested */
4170
7
          goto FAILED;
4171
7
          }
4172
4173
        /* Process the character class start. If the first character is '^', set
4174
        the negation flag. If the first few characters (either before or after ^)
4175
        are \Q\E or \E or space or tab in extended-more mode, we skip them too.
4176
        This makes for compatibility with Perl. */
4177
4178
104k
        negate_class = FALSE;
4179
104k
        for (;;)
4180
155k
          {
4181
155k
          if (ptr >= ptrend)
4182
72
            {
4183
72
            if (start_c == CHAR_LEFT_PARENTHESIS)
4184
6
              errorcode = ERR14;  /* Missing terminating ')' */
4185
66
            else
4186
66
              errorcode = ERR6;   /* Missing terminating ']' */
4187
72
            goto FAILED;
4188
72
            }
4189
4190
155k
          GETCHARINCTEST(c, ptr);
4191
155k
          if (new_class_mode_state == CLASS_MODE_PERL_EXT) break;
4192
153k
          else if (c == CHAR_BACKSLASH)
4193
8.86k
            {
4194
8.86k
            if (ptr < ptrend && *ptr == CHAR_E) ptr++;
4195
8.64k
            else if (ptrend - ptr >= 3 &&
4196
8.44k
                PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4197
195
              ptr += 3;
4198
8.45k
            else
4199
8.45k
              break;
4200
8.86k
            }
4201
144k
          else if ((c == CHAR_SPACE || c == CHAR_HT) &&  /* Note: just these two */
4202
1.10k
                   ((options & PCRE2_EXTENDED_MORE) != 0 ||
4203
785
                    new_class_mode_state >= CLASS_MODE_PERL_EXT))
4204
578
            continue;
4205
144k
          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4206
49.2k
            negate_class = TRUE;
4207
94.9k
          else break;
4208
155k
          }
4209
4210
        /* Now the real contents of the class; c has the first "real" character.
4211
        Empty classes are permitted only if the option is set, and if it's not
4212
        a Perl-extended class. */
4213
4214
104k
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4215
8.72k
            (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0 &&
4216
4.47k
            new_class_mode_state < CLASS_MODE_PERL_EXT)
4217
4.16k
          {
4218
4.16k
          PCRE2_ASSERT(start_c == CHAR_LEFT_SQUARE_BRACKET);
4219
4220
4.16k
          if (class_start != NULL)
4221
1.22k
            {
4222
1.22k
            PCRE2_ASSERT(class_depth_m1 >= 0);
4223
            /* Represents that the class is an extended class. */
4224
1.22k
            *class_start |= CLASS_IS_ECLASS;
4225
1.22k
            class_start = NULL;
4226
1.22k
            }
4227
4228
4.16k
          *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
4229
4230
          /* Leave nesting depth unchanged; but check for zero depth to handle the
4231
          very first (top-level) class being empty. */
4232
4.16k
          if (class_depth_m1 < 0) break;
4233
4234
2.21k
          class_range_state = RANGE_NO; /* for processing the containing class */
4235
2.21k
          class_op_state = CLASS_OP_OPERAND;
4236
2.21k
          goto CLASS_CONTINUE;
4237
4.16k
          }
4238
4239
        /* Enter a non-empty class. */
4240
4241
100k
        if (class_start != NULL)
4242
5.51k
          {
4243
5.51k
          PCRE2_ASSERT(class_depth_m1 >= 0);
4244
          /* Represents that the class is an extended class. */
4245
5.51k
          *class_start |= CLASS_IS_ECLASS;
4246
5.51k
          class_start = NULL;
4247
5.51k
          }
4248
4249
100k
        class_start = parsed_pattern;
4250
100k
        *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
4251
100k
        class_range_state = RANGE_NO;
4252
100k
        class_op_state = CLASS_OP_EMPTY;
4253
100k
        class_mode_state = new_class_mode_state;
4254
100k
        ++class_depth_m1;
4255
100k
        if (class_maxdepth_m1 < class_depth_m1)
4256
93.2k
          class_maxdepth_m1 = class_depth_m1;
4257
        /* Reset; no op seen yet at new depth. */
4258
100k
        cb->class_op_used[class_depth_m1] = 0;
4259
4260
        /* Implement the special start-of-class literal meaning of ']'. */
4261
100k
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4262
4.55k
            new_class_mode_state != CLASS_MODE_PERL_EXT)
4263
4.55k
          {
4264
4.55k
          class_range_state = RANGE_OK_LITERAL;
4265
4.55k
          class_op_state = CLASS_OP_OPERAND;
4266
4.55k
          PARSED_LITERAL(c, parsed_pattern);
4267
4.55k
          goto CLASS_CONTINUE;
4268
4.55k
          }
4269
4270
96.1k
        continue;  /* We have already loaded c with the next character */
4271
100k
        }
4272
4273
      /* Check for the end of the class. */
4274
4275
2.25M
      else if (c == CHAR_RIGHT_SQUARE_BRACKET ||
4276
2.15M
               (c == CHAR_RIGHT_PARENTHESIS && class_mode_state == CLASS_MODE_PERL_EXT))
4277
98.0k
        {
4278
        /* In Perl extended mode, the ']' can only be used to match the
4279
        opening '[', and ')' must match an opening parenthesis. */
4280
98.0k
        if (class_mode_state == CLASS_MODE_PERL_EXT)
4281
1.04k
          {
4282
1.04k
          if (c == CHAR_RIGHT_SQUARE_BRACKET && class_depth_m1 != 0)
4283
3
            {
4284
3
            errorcode = ERR14;
4285
3
            ptr--;  /* Correct the offset */
4286
3
            goto FAILED;
4287
3
            }
4288
1.04k
          if (c == CHAR_RIGHT_PARENTHESIS && class_depth_m1 < 1)
4289
5
            {
4290
5
            errorcode = ERR22;
4291
5
            goto FAILED;
4292
5
            }
4293
1.04k
          }
4294
4295
        /* Check no trailing operator. */
4296
98.0k
        if (class_op_state == CLASS_OP_OPERATOR)
4297
3
          {
4298
3
          errorcode = ERR110;
4299
3
          goto FAILED;
4300
3
          }
4301
4302
        /* Check no empty expression for Perl extended expressions. */
4303
98.0k
        if (class_mode_state == CLASS_MODE_PERL_EXT &&
4304
1.03k
            class_op_state == CLASS_OP_EMPTY)
4305
5
          {
4306
5
          errorcode = ERR114;
4307
5
          goto FAILED;
4308
5
          }
4309
4310
        /* -] at the end of a class is a literal '-' */
4311
98.0k
        if (class_range_state == RANGE_STARTED)
4312
373
          parsed_pattern[-1] = CHAR_MINUS;
4313
4314
98.0k
        *parsed_pattern++ = META_CLASS_END;
4315
4316
98.0k
        if (--class_depth_m1 < 0)
4317
86.0k
          {
4318
          /* Check for and consume ')' after '(?[...]'. */
4319
86.0k
          PCRE2_ASSERT(class_mode_state != CLASS_MODE_PERL_EXT_LEAF);
4320
86.0k
          if (class_mode_state == CLASS_MODE_PERL_EXT)
4321
679
            {
4322
679
            if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4323
8
              {
4324
8
              errorcode = ERR115;
4325
8
              goto FAILED;
4326
8
              }
4327
4328
671
            ptr++;
4329
671
            }
4330
4331
86.0k
          break;
4332
86.0k
          }
4333
4334
11.9k
        class_range_state = RANGE_NO; /* for processing the containing class */
4335
11.9k
        class_op_state = CLASS_OP_OPERAND;
4336
11.9k
        if (class_mode_state == CLASS_MODE_PERL_EXT_LEAF)
4337
1.50k
          class_mode_state = CLASS_MODE_PERL_EXT;
4338
        /* The extended class flag has already
4339
        been set for the parent class. */
4340
11.9k
        class_start = NULL;
4341
11.9k
        }
4342
4343
      /* Handle a Perl set binary operator */
4344
4345
2.15M
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4346
3.75k
               (c == CHAR_PLUS || c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4347
3.05k
                c == CHAR_AMPERSAND || c == CHAR_CIRCUMFLEX_ACCENT))
4348
1.88k
        {
4349
        /* Check that there was a preceding operand. */
4350
1.88k
        if (class_op_state != CLASS_OP_OPERAND)
4351
24
          {
4352
24
          errorcode = ERR109;
4353
24
          goto FAILED;
4354
24
          }
4355
4356
1.86k
        if (class_start != NULL)
4357
129
          {
4358
129
          PCRE2_ASSERT(class_depth_m1 >= 0);
4359
          /* Represents that the class is an extended class. */
4360
129
          *class_start |= CLASS_IS_ECLASS;
4361
129
          class_start = NULL;
4362
129
          }
4363
4364
1.86k
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4365
1.86k
                     class_range_state != RANGE_FORBID_STARTED);
4366
4367
1.86k
        *parsed_pattern++ = c == CHAR_PLUS? META_ECLASS_OR :
4368
1.86k
                            c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4369
1.66k
                            c == CHAR_MINUS? META_ECLASS_SUB :
4370
1.44k
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4371
1.17k
                            META_ECLASS_XOR;
4372
1.86k
        class_range_state = RANGE_NO;
4373
1.86k
        class_op_state = CLASS_OP_OPERATOR;
4374
1.86k
        }
4375
4376
      /* Handle a Perl set unary operator */
4377
4378
2.15M
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4379
1.86k
               c == CHAR_EXCLAMATION_MARK)
4380
695
        {
4381
        /* Check that the "!" has not got a preceding operand (i.e. it's the
4382
        start of the class, or follows an operator). */
4383
695
        if (class_op_state == CLASS_OP_OPERAND)
4384
3
          {
4385
3
          errorcode = ERR113;
4386
3
          goto FAILED;
4387
3
          }
4388
4389
692
        if (class_start != NULL)
4390
352
          {
4391
352
          PCRE2_ASSERT(class_depth_m1 >= 0);
4392
          /* Represents that the class is an extended class. */
4393
352
          *class_start |= CLASS_IS_ECLASS;
4394
352
          class_start = NULL;
4395
352
          }
4396
4397
692
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4398
692
                     class_range_state != RANGE_FORBID_STARTED);
4399
4400
692
        *parsed_pattern++ = META_ECLASS_NOT;
4401
692
        class_range_state = RANGE_NO;
4402
692
        class_op_state = CLASS_OP_OPERATOR;
4403
692
        }
4404
4405
      /* Handle a UTS#18 set operator */
4406
4407
2.15M
      else if (class_mode_state == CLASS_MODE_ALT_EXT &&
4408
489k
               (c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4409
465k
                c == CHAR_AMPERSAND || c == CHAR_TILDE) &&
4410
31.1k
               ptr < ptrend && *ptr == c)
4411
6.96k
        {
4412
6.96k
        ++ptr;
4413
4414
        /* Check there isn't a triple-repetition. */
4415
6.96k
        if (ptr < ptrend && *ptr == c)
4416
14
          {
4417
247
          while (ptr < ptrend && *ptr == c) ++ptr;  /* Improve error offset. */
4418
14
          errorcode = ERR108;
4419
14
          goto FAILED;
4420
14
          }
4421
4422
        /* Check for a preceding operand. */
4423
6.95k
        if (class_op_state != CLASS_OP_OPERAND)
4424
3
          {
4425
3
          errorcode = ERR109;
4426
3
          goto FAILED;
4427
3
          }
4428
4429
        /* Check for mixed precedence. Forbid [A--B&&C]. */
4430
6.94k
        if (cb->class_op_used[class_depth_m1] != 0 &&
4431
5.69k
            cb->class_op_used[class_depth_m1] != (uint8_t)c)
4432
1
          {
4433
1
          errorcode = ERR111;
4434
1
          goto FAILED;
4435
1
          }
4436
4437
6.94k
        if (class_start != NULL)
4438
1.13k
          {
4439
1.13k
          PCRE2_ASSERT(class_depth_m1 >= 0);
4440
          /* Represents that the class is an extended class. */
4441
1.13k
          *class_start |= CLASS_IS_ECLASS;
4442
1.13k
          class_start = NULL;
4443
1.13k
          }
4444
4445
        /* Dangling '-' before an operator is a literal */
4446
6.94k
        if (class_range_state == RANGE_STARTED)
4447
284
          parsed_pattern[-1] = CHAR_MINUS;
4448
4449
6.94k
        *parsed_pattern++ = c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4450
6.94k
                            c == CHAR_MINUS? META_ECLASS_SUB :
4451
6.53k
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4452
6.04k
                            META_ECLASS_XOR;
4453
6.94k
        class_range_state = RANGE_NO;
4454
6.94k
        class_op_state = CLASS_OP_OPERATOR;
4455
6.94k
        cb->class_op_used[class_depth_m1] = (uint8_t)c;
4456
6.94k
        }
4457
4458
      /* Handle escapes in a class */
4459
4460
2.14M
      else if (c == CHAR_BACKSLASH)
4461
59.7k
        {
4462
59.7k
        tempptr = ptr;
4463
59.7k
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
4464
59.7k
          xoptions, cb->bracount, TRUE, cb);
4465
4466
59.7k
        if (errorcode != 0)
4467
44
          {
4468
44
          if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0 ||
4469
0
              class_mode_state >= CLASS_MODE_PERL_EXT)
4470
44
            goto FAILED;
4471
0
          ptr = tempptr;
4472
0
          if (ptr >= ptrend) c = CHAR_BACKSLASH; else
4473
0
            {
4474
0
            GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
4475
0
            }
4476
0
          escape = 0;                 /* Treat as literal character */
4477
0
          }
4478
4479
59.6k
        switch(escape)
4480
59.6k
          {
4481
20.4k
          case 0:  /* Escaped character code point is in c */
4482
20.4k
          char_is_literal = FALSE;
4483
20.4k
          goto CLASS_LITERAL;      /* (a few lines above) */
4484
4485
1.27k
          case ESC_b:
4486
1.27k
          c = CHAR_BS;    /* \b is backspace in a class */
4487
1.27k
          char_is_literal = FALSE;
4488
1.27k
          goto CLASS_LITERAL;
4489
4490
214
          case ESC_k:
4491
214
          c = CHAR_k;     /* \k is not special in a class, just like \g */
4492
214
          char_is_literal = FALSE;
4493
214
          goto CLASS_LITERAL;
4494
4495
248
          case ESC_Q:
4496
248
          inescq = TRUE;  /* Enter literal mode */
4497
248
          goto CLASS_CONTINUE;
4498
4499
277
          case ESC_E:     /* Ignore orphan \E */
4500
277
          goto CLASS_CONTINUE;
4501
4502
8
          case ESC_B:     /* Always an error in a class */
4503
14
          case ESC_R:
4504
22
          case ESC_X:
4505
22
          errorcode = ERR7;
4506
22
          goto FAILED;
4507
4508
7
          case ESC_N:     /* Not permitted by Perl either */
4509
7
          errorcode = ERR71;
4510
7
          goto FAILED;
4511
4512
4.08k
          case ESC_H:
4513
5.17k
          case ESC_h:
4514
6.13k
          case ESC_V:
4515
6.96k
          case ESC_v:
4516
6.96k
          *parsed_pattern++ = META_ESCAPE + escape;
4517
6.96k
          break;
4518
4519
          /* These escapes may be converted to Unicode property tests when
4520
          PCRE2_UCP is set. */
4521
4522
3.12k
          case ESC_d:
4523
11.1k
          case ESC_D:
4524
14.0k
          case ESC_s:
4525
20.5k
          case ESC_S:
4526
22.9k
          case ESC_w:
4527
26.0k
          case ESC_W:
4528
26.0k
          parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
4529
26.0k
            xoptions);
4530
26.0k
          break;
4531
4532
          /* Explicit Unicode property matching */
4533
4534
2.96k
          case ESC_P:
4535
4.12k
          case ESC_p:
4536
4.12k
#ifdef SUPPORT_UNICODE
4537
4.12k
            {
4538
4.12k
            BOOL negated;
4539
4.12k
            uint16_t ptype = 0, pdata = 0;
4540
4.12k
            if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))
4541
21
              goto FAILED;
4542
4543
            /* In caseless matching, particular characteristics Lu, Ll, and Lt
4544
            get converted to the general characteristic L&. That is, upper,
4545
            lower, and title case letters are all conflated. */
4546
4547
4.10k
            if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
4548
994
                (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
4549
927
              {
4550
927
              ptype = PT_LAMP;
4551
927
              pdata = 0;
4552
927
              }
4553
4554
4.10k
            if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
4555
4.10k
            *parsed_pattern++ = META_ESCAPE + escape;
4556
4.10k
            *parsed_pattern++ = (ptype << 16) | pdata;
4557
4.10k
            }
4558
#else
4559
          errorcode = ERR45;
4560
          goto FAILED;
4561
#endif
4562
0
          break;  /* End \P and \p */
4563
4564
          /* All others are not allowed in a class */
4565
4566
          /* LCOV_EXCL_START */
4567
0
          default:
4568
0
          PCRE2_DEBUG_UNREACHABLE();
4569
0
          PCRE2_FALLTHROUGH /* Fall through */
4570
          /* LCOV_EXCL_STOP */
4571
4572
3
          case ESC_A:
4573
12
          case ESC_Z:
4574
17
          case ESC_z:
4575
17
          case ESC_G:
4576
18
          case ESC_K:
4577
18
          case ESC_C:
4578
18
          errorcode = ERR7;
4579
18
          goto FAILED;
4580
59.6k
          }
4581
4582
        /* All the switch-cases above which end in "break" describe a set
4583
        of characters. None may start a range. */
4584
4585
        /* The second part of a range can be a single-character escape
4586
        sequence (detected above), but not any of the other escapes. Perl
4587
        treats a hyphen as a literal in such circumstances. However, in Perl's
4588
        warning mode, a warning is given, so PCRE now faults it, as it is
4589
        almost certainly a mistake on the user's part. */
4590
4591
37.1k
        if (class_range_state == RANGE_STARTED)
4592
3
          {
4593
3
          errorcode = ERR50;
4594
3
          goto FAILED;
4595
3
          }
4596
4597
        /* Perl gives a warning unless the hyphen following a multi-character
4598
        escape is the last character in the class. PCRE throws an error. */
4599
4600
37.1k
        if (class_range_state == RANGE_FORBID_STARTED)
4601
3
          {
4602
3
          ptr = class_range_forbid_ptr;
4603
3
          errorcode = ERR50;
4604
3
          goto FAILED;
4605
3
          }
4606
4607
        /* Disallow implicit union in Perl extended classes. */
4608
4609
37.1k
        if (class_op_state == CLASS_OP_OPERAND &&
4610
30.1k
            class_mode_state == CLASS_MODE_PERL_EXT)
4611
3
          {
4612
3
          errorcode = ERR113;
4613
3
          goto FAILED;
4614
3
          }
4615
4616
37.1k
        class_range_state = RANGE_FORBID_NO;
4617
37.1k
        class_op_state = CLASS_OP_OPERAND;
4618
37.1k
        }
4619
4620
      /* Forbid unescaped literals, and the special meaning of '-', inside a
4621
      Perl extended class. */
4622
4623
2.08M
      else if (class_mode_state == CLASS_MODE_PERL_EXT)
4624
75
        {
4625
75
        errorcode = ERR116;
4626
75
        goto FAILED;
4627
75
        }
4628
4629
      /* Handle potential start of range */
4630
4631
2.08M
      else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
4632
42.6k
        {
4633
42.6k
        *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
4634
42.2k
          META_RANGE_LITERAL : META_RANGE_ESCAPED;
4635
42.6k
        class_range_state = RANGE_STARTED;
4636
42.6k
        }
4637
4638
      /* Handle forbidden start of range */
4639
4640
2.04M
      else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
4641
304
        {
4642
304
        *parsed_pattern++ = CHAR_MINUS;
4643
304
        class_range_state = RANGE_FORBID_STARTED;
4644
304
        class_range_forbid_ptr = ptr;
4645
304
        }
4646
4647
      /* Handle a literal character */
4648
4649
2.04M
      else
4650
2.04M
        {
4651
2.06M
        CLASS_LITERAL:
4652
4653
        /* Disallow implicit union in Perl extended classes. */
4654
4655
2.06M
        if (class_op_state == CLASS_OP_OPERAND &&
4656
1.97M
            class_mode_state == CLASS_MODE_PERL_EXT)
4657
3
          {
4658
3
          errorcode = ERR113;
4659
3
          goto FAILED;
4660
3
          }
4661
4662
2.06M
        if (class_range_state == RANGE_STARTED)
4663
41.8k
          {
4664
41.8k
          if (c == parsed_pattern[-2])       /* Optimize one-char range */
4665
325
            parsed_pattern--;
4666
41.4k
          else if (parsed_pattern[-2] > c)   /* Check range is in order */
4667
121
            {
4668
121
            errorcode = ERR8;
4669
121
            goto FAILED;
4670
121
            }
4671
41.3k
          else
4672
41.3k
            {
4673
41.3k
            if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
4674
331
              parsed_pattern[-1] = META_RANGE_ESCAPED;
4675
41.3k
            PARSED_LITERAL(c, parsed_pattern);
4676
41.3k
            }
4677
41.7k
          class_range_state = RANGE_NO;
4678
41.7k
          class_op_state = CLASS_OP_OPERAND;
4679
41.7k
          }
4680
2.02M
        else if (class_range_state == RANGE_FORBID_STARTED)
4681
9
          {
4682
9
          ptr = class_range_forbid_ptr;
4683
9
          errorcode = ERR50;
4684
9
          goto FAILED;
4685
9
          }
4686
2.02M
        else  /* Potential start of range */
4687
2.02M
          {
4688
2.02M
          class_range_state = char_is_literal?
4689
2.00M
            RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
4690
2.02M
          class_op_state = CLASS_OP_OPERAND;
4691
2.02M
          PARSED_LITERAL(c, parsed_pattern);
4692
2.02M
          }
4693
2.06M
        }
4694
4695
      /* Proceed to next thing in the class. */
4696
4697
2.19M
      CLASS_CONTINUE:
4698
2.19M
      if (ptr >= ptrend)
4699
1.57k
        {
4700
1.57k
        if (class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 > 0)
4701
13
          errorcode = ERR14;   /* Missing terminating ')' */
4702
1.57k
        if (class_mode_state == CLASS_MODE_ALT_EXT &&
4703
286
            class_depth_m1 == 0 && class_maxdepth_m1 == 1)
4704
43
          errorcode = ERR112;  /* Missing terminating ']', but we saw '[ [ ]...' */
4705
1.52k
        else
4706
1.52k
          errorcode = ERR6;    /* Missing terminating ']' */
4707
1.57k
        goto FAILED;
4708
1.57k
        }
4709
2.18M
      GETCHARINCTEST(c, ptr);
4710
2.18M
      }     /* End of class-processing loop */
4711
4712
88.0k
    break;  /* End of character class */
4713
4714
4715
    /* ---- Opening parenthesis ---- */
4716
4717
278k
    case CHAR_LEFT_PARENTHESIS:
4718
278k
    if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4719
4720
    /* If ( is not followed by ? it is either a capture or a special verb or an
4721
    alpha assertion or a positive non-atomic lookahead. */
4722
4723
278k
    if (*ptr != CHAR_QUESTION_MARK)
4724
155k
      {
4725
155k
      const char *vn;
4726
4727
      /* Handle capturing brackets (or non-capturing if auto-capture is turned
4728
      off). */
4729
4730
155k
      if (*ptr != CHAR_ASTERISK)
4731
129k
        {
4732
129k
        nest_depth++;
4733
129k
        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
4734
113k
          {
4735
113k
          if (cb->bracount >= MAX_GROUP_NUMBER)
4736
0
            {
4737
0
            errorcode = ERR97;
4738
0
            goto FAILED;
4739
0
            }
4740
113k
          cb->bracount++;
4741
113k
          *parsed_pattern++ = META_CAPTURE | cb->bracount;
4742
113k
          }
4743
16.2k
        else *parsed_pattern++ = META_NOCAPTURE;
4744
129k
        }
4745
4746
      /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
4747
      quantifier" error rather than "(*MARK) must have an argument". */
4748
4749
26.3k
      else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
4750
11
        break;
4751
4752
      /* Handle "alpha assertions" such as (*pla:...). Most of these are
4753
      synonyms for the historical symbolic assertions, but the script run and
4754
      non-atomic lookaround ones are new. They are distinguished by starting
4755
      with a lower case letter, which is detected here by looking the character
4756
      up in the character-type table (ctype_lcletter). */
4757
4758
26.3k
      else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
4759
4.71k
        {
4760
4.71k
        uint32_t meta;
4761
4762
4.71k
        vn = alasnames;
4763
4.71k
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4764
4.71k
          &errorcode, cb)) goto FAILED;
4765
4.71k
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4766
4.70k
        if (*ptr != CHAR_COLON)
4767
23
          {
4768
23
          errorcode = ERR95;  /* Malformed */
4769
23
          goto FAILED_FORWARD;
4770
23
          }
4771
4772
        /* Scan the table of alpha assertion names */
4773
4774
65.4k
        for (i = 0; i < alascount; i++)
4775
65.4k
          {
4776
65.4k
          if (namelen == alasmeta[i].len &&
4777
13.3k
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4778
4.67k
            break;
4779
60.7k
          vn += alasmeta[i].len + 1;
4780
60.7k
          }
4781
4782
4.68k
        if (i >= alascount)
4783
9
          {
4784
9
          errorcode = ERR95;  /* Alpha assertion not recognized */
4785
9
          goto FAILED;
4786
9
          }
4787
4788
        /* Check for expecting an assertion condition. If so, only atomic
4789
        lookaround assertions are valid. */
4790
4791
4.67k
        meta = alasmeta[i].meta;
4792
4.67k
        if (prev_expect_cond_assert > 0 &&
4793
310
            (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
4794
6
          {
4795
6
          errorcode = ERR28;  /* Atomic assertion expected */
4796
6
          goto FAILED;
4797
6
          }
4798
4799
        /* The lookaround alphabetic synonyms can mostly be handled by jumping
4800
        to the code that handles the traditional symbolic forms. */
4801
4802
4.66k
        switch(meta)
4803
4.66k
          {
4804
          /* LCOV_EXCL_START */
4805
0
          default:
4806
0
          PCRE2_DEBUG_UNREACHABLE();
4807
0
          errorcode = ERR89;  /* Unknown code; should never occur because */
4808
0
          goto FAILED;        /* the meta values come from a table above. */
4809
          /* LCOV_EXCL_STOP */
4810
4811
0
          case META_ATOMIC:
4812
0
          goto ATOMIC_GROUP;
4813
4814
66
          case META_LOOKAHEAD:
4815
66
          goto POSITIVE_LOOK_AHEAD;
4816
4817
0
          case META_LOOKAHEAD_NA:
4818
0
          goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4819
4820
196
          case META_LOOKAHEADNOT:
4821
196
          goto NEGATIVE_LOOK_AHEAD;
4822
4823
786
          case META_SCS:
4824
786
          ptr++;
4825
786
          *parsed_pattern++ = META_SCS;
4826
4827
786
          parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
4828
786
                                              0, &errorcode, cb);
4829
786
          if (parsed_pattern == NULL) goto FAILED;
4830
776
          goto POST_ASSERTION;
4831
4832
776
          case META_LOOKBEHIND:
4833
450
          case META_LOOKBEHINDNOT:
4834
450
          case META_LOOKBEHIND_NA:
4835
450
          *parsed_pattern++ = meta;
4836
450
          ptr--;
4837
450
          goto POST_LOOKBEHIND;
4838
4839
          /* The script run facilities are handled here. Unicode support is
4840
          required (give an error if not, as this is a security issue). Always
4841
          record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4842
          META_ATOMIC and remember that we need two META_KETs at the end. */
4843
4844
2.40k
          case META_SCRIPT_RUN:
4845
3.17k
          case META_ATOMIC_SCRIPT_RUN:
4846
3.17k
#ifdef SUPPORT_UNICODE
4847
3.17k
          *parsed_pattern++ = META_SCRIPT_RUN;
4848
3.17k
          nest_depth++;
4849
3.17k
          ptr++;
4850
3.17k
          if (meta == META_ATOMIC_SCRIPT_RUN)
4851
766
            {
4852
766
            *parsed_pattern++ = META_ATOMIC;
4853
766
            if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4854
249
            else if (++top_nest >= end_nests)
4855
0
              {
4856
0
              errorcode = ERR84;
4857
0
              goto FAILED;
4858
0
              }
4859
766
            top_nest->nest_depth = nest_depth;
4860
766
            top_nest->flags = NSF_ATOMICSR;
4861
766
            top_nest->options = options & PARSE_TRACKED_OPTIONS;
4862
766
            top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4863
4864
#ifdef PCRE2_DEBUG
4865
            /* We'll write out two META_KETs for a single ")" in the input
4866
            pattern, so we reserve space for that in our bounds check. */
4867
            parsed_pattern_extra++;
4868
#endif
4869
766
            }
4870
3.17k
          break;
4871
#else  /* SUPPORT_UNICODE */
4872
          errorcode = ERR96;
4873
          goto FAILED;
4874
#endif
4875
4.66k
          }
4876
4.66k
        }
4877
4878
4879
      /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4880
4881
21.6k
      else
4882
21.6k
        {
4883
21.6k
        vn = verbnames;
4884
21.6k
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4885
21.6k
          &errorcode, cb)) goto FAILED;
4886
21.6k
        if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4887
14.4k
                              *ptr != CHAR_RIGHT_PARENTHESIS))
4888
55
          {
4889
55
          errorcode = ERR60;  /* Malformed */
4890
55
          goto FAILED;
4891
55
          }
4892
4893
        /* Scan the table of verb names */
4894
4895
119k
        for (i = 0; i < verbcount; i++)
4896
119k
          {
4897
119k
          if (namelen == verbs[i].len &&
4898
43.7k
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4899
21.5k
            break;
4900
97.5k
          vn += verbs[i].len + 1;
4901
97.5k
          }
4902
4903
21.5k
        if (i >= verbcount)
4904
14
          {
4905
14
          errorcode = ERR60;  /* Verb not recognized */
4906
14
          goto FAILED;
4907
14
          }
4908
4909
        /* An empty argument is treated as no argument. */
4910
4911
21.5k
        if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4912
7.13k
             ptr[1] == CHAR_RIGHT_PARENTHESIS)
4913
68
          ptr++;    /* Advance to the closing parens */
4914
4915
        /* Check for mandatory non-empty argument; this is (*MARK) */
4916
4917
21.5k
        if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4918
3
          {
4919
3
          errorcode = ERR66;
4920
3
          goto FAILED;
4921
3
          }
4922
4923
        /* Remember where this verb, possibly with a preceding (*MARK), starts,
4924
        for handling quantified (*ACCEPT). */
4925
4926
21.5k
        verbstartptr = parsed_pattern;
4927
21.5k
        okquantifier = (verbs[i].meta == META_ACCEPT);
4928
#ifdef PCRE2_DEBUG
4929
        /* Reserve space in our bounds check for optionally wrapping the (*ACCEPT)
4930
        with a non-capturing bracket, if there is a following quantifier. */
4931
        if (okquantifier) parsed_pattern_extra += 2;
4932
#endif
4933
4934
        /* It appears that Perl allows any characters whatsoever, other than a
4935
        closing parenthesis, to appear in arguments ("names"), so we no longer
4936
        insist on letters, digits, and underscores. Perl does not, however, do
4937
        any interpretation within arguments, and has no means of including a
4938
        closing parenthesis. PCRE supports escape processing but only when it
4939
        is requested by an option. We set inverbname TRUE here, and let the
4940
        main loop take care of this so that escape and \x processing is done by
4941
        the main code above. */
4942
4943
21.5k
        if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4944
7.07k
          {
4945
          /* Some optional arguments can be treated as a preceding (*MARK) */
4946
4947
7.07k
          if (verbs[i].has_arg < 0)
4948
644
            {
4949
644
            add_after_mark = verbs[i].meta;
4950
644
            *parsed_pattern++ = META_MARK;
4951
644
            }
4952
4953
          /* The remaining verbs with arguments (except *MARK) need a different
4954
          opcode. */
4955
4956
6.43k
          else
4957
6.43k
            {
4958
6.43k
            *parsed_pattern++ = verbs[i].meta +
4959
6.43k
              ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4960
6.43k
            }
4961
4962
          /* Set up for reading the name in the main loop. */
4963
4964
7.07k
          verblengthptr = parsed_pattern++;
4965
7.07k
          verbnamestart = ptr;
4966
7.07k
          inverbname = TRUE;
4967
7.07k
          }
4968
14.5k
        else  /* No verb "name" argument */
4969
14.5k
          {
4970
14.5k
          *parsed_pattern++ = verbs[i].meta;
4971
14.5k
          }
4972
21.5k
        }     /* End of (*VERB) handling */
4973
154k
      break;  /* Done with this parenthesis */
4974
155k
      }       /* End of groups that don't start with (? */
4975
4976
4977
    /* ---- Items starting (? ---- */
4978
4979
    /* The type of item is determined by what follows (?. Handle (?| and option
4980
    changes under "default" because both need a new block on the nest stack.
4981
    Comments starting with (?# are handled above. Note that there is some
4982
    ambiguity about the sequence (?- because if a digit follows it's a relative
4983
    recursion or subroutine call whereas otherwise it's an option unsetting. */
4984
4985
122k
    if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4986
4987
122k
    switch(*ptr)
4988
122k
      {
4989
13.2k
      default:
4990
13.2k
      if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4991
3.00k
        goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4992
4993
      /* We now have either (?| or a (possibly empty) option setting,
4994
      optionally followed by a non-capturing group. */
4995
4996
10.2k
      nest_depth++;
4997
10.2k
      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4998
6.84k
      else if (++top_nest >= end_nests)
4999
0
        {
5000
0
        errorcode = ERR84;
5001
0
        goto FAILED;
5002
0
        }
5003
10.2k
      top_nest->nest_depth = nest_depth;
5004
10.2k
      top_nest->flags = 0;
5005
10.2k
      top_nest->options = options & PARSE_TRACKED_OPTIONS;
5006
10.2k
      top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5007
5008
      /* Start of non-capturing group that resets the capture count for each
5009
      branch. */
5010
5011
10.2k
      if (*ptr == CHAR_VERTICAL_LINE)
5012
2.49k
        {
5013
2.49k
        top_nest->reset_group = (uint16_t)cb->bracount;
5014
2.49k
        top_nest->max_group = (uint16_t)cb->bracount;
5015
2.49k
        top_nest->flags |= NSF_RESET;
5016
2.49k
        cb->external_flags |= PCRE2_DUPCAPUSED;
5017
2.49k
        *parsed_pattern++ = META_NOCAPTURE;
5018
2.49k
        ptr++;
5019
2.49k
        }
5020
5021
        /* Scan for options aimnrsxJU (and aD, aP, aS, aT, aW) to be set or unset. */
5022
5023
7.76k
      else
5024
7.76k
        {
5025
7.76k
        BOOL hyphenok = TRUE;
5026
7.76k
        uint32_t oldoptions = options;
5027
7.76k
        uint32_t oldxoptions = xoptions;
5028
5029
7.76k
        top_nest->reset_group = 0;
5030
7.76k
        top_nest->max_group = 0;
5031
7.76k
        set = unset = 0;
5032
7.76k
        optset = &set;
5033
7.76k
        xset = xunset = 0;
5034
7.76k
        xoptset = &xset;
5035
5036
        /* ^ at the start unsets irmnsx and disables the subsequent use of - */
5037
5038
7.76k
        if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
5039
208
          {
5040
208
          options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
5041
208
                       PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
5042
208
          xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
5043
208
          hyphenok = FALSE;
5044
208
          ptr++;
5045
208
          }
5046
5047
16.2k
        while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
5048
11.7k
                               *ptr != CHAR_COLON)
5049
8.60k
          {
5050
8.60k
          switch (*ptr++)
5051
8.60k
            {
5052
659
            case CHAR_MINUS:
5053
659
            if (!hyphenok)
5054
3
              {
5055
3
              errorcode = ERR94;
5056
3
              goto FAILED;
5057
3
              }
5058
656
            optset = &unset;
5059
656
            xoptset = &xunset;
5060
656
            hyphenok = FALSE;
5061
656
            break;
5062
5063
            /* There are some two-character sequences that start with 'a'. */
5064
5065
1.56k
            case CHAR_a:
5066
1.56k
            if (ptr < ptrend)
5067
1.56k
              {
5068
1.56k
              if (*ptr == CHAR_D)
5069
198
                {
5070
198
                *xoptset |= PCRE2_EXTRA_ASCII_BSD;
5071
198
                ptr++;
5072
198
                break;
5073
198
                }
5074
1.36k
              if (*ptr == CHAR_P)
5075
194
                {
5076
194
                *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
5077
194
                ptr++;
5078
194
                break;
5079
194
                }
5080
1.17k
              if (*ptr == CHAR_S)
5081
194
                {
5082
194
                *xoptset |= PCRE2_EXTRA_ASCII_BSS;
5083
194
                ptr++;
5084
194
                break;
5085
194
                }
5086
976
              if (*ptr == CHAR_T)
5087
366
                {
5088
366
                *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
5089
366
                ptr++;
5090
366
                break;
5091
366
                }
5092
610
              if (*ptr == CHAR_W)
5093
196
                {
5094
196
                *xoptset |= PCRE2_EXTRA_ASCII_BSW;
5095
196
                ptr++;
5096
196
                break;
5097
196
                }
5098
610
              }
5099
417
            *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
5100
417
                        PCRE2_EXTRA_ASCII_BSW|
5101
417
                        PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
5102
417
            break;
5103
5104
774
            case CHAR_J:  /* Record that it changed in the external options */
5105
774
            *optset |= PCRE2_DUPNAMES;
5106
774
            cb->external_flags |= PCRE2_JCHANGED;
5107
774
            break;
5108
5109
1.97k
            case CHAR_i: *optset |= PCRE2_CASELESS; break;
5110
332
            case CHAR_m: *optset |= PCRE2_MULTILINE; break;
5111
200
            case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
5112
1.04k
            case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
5113
727
            case CHAR_s: *optset |= PCRE2_DOTALL; break;
5114
636
            case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
5115
5116
            /* If x appears twice it sets the extended extended option. */
5117
5118
569
            case CHAR_x:
5119
569
            *optset |= PCRE2_EXTENDED;
5120
569
            if (ptr < ptrend && *ptr == CHAR_x)
5121
244
              {
5122
244
              *optset |= PCRE2_EXTENDED_MORE;
5123
244
              ptr++;
5124
244
              }
5125
569
            break;
5126
5127
124
            default:
5128
124
            errorcode = ERR11;
5129
124
            goto FAILED;
5130
8.60k
            }
5131
8.60k
          }
5132
5133
        /* If we are setting extended without extended-more, ensure that any
5134
        existing extended-more gets unset. Also, unsetting extended must also
5135
        unset extended-more. */
5136
5137
7.63k
        if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
5138
7.38k
            (unset & PCRE2_EXTENDED) != 0)
5139
316
          unset |= PCRE2_EXTENDED_MORE;
5140
5141
7.63k
        options = (options | set) & (~unset);
5142
7.63k
        xoptions = (xoptions | xset) & (~xunset);
5143
5144
        /* If the options ended with ')' this is not the start of a nested
5145
        group with option changes, so the options change at this level.
5146
        In this case, if the previous level set up a nest block, discard the
5147
        one we have just created. Otherwise adjust it for the previous level.
5148
        If the options ended with ':' we are starting a non-capturing group,
5149
        possibly with an options setting. */
5150
5151
7.63k
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5152
7.52k
        if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
5153
4.40k
          {
5154
4.40k
          nest_depth--;  /* This is not a nested group after all. */
5155
4.40k
          if (top_nest > (nest_save *)(cb->start_workspace) &&
5156
3.11k
              (top_nest-1)->nest_depth == nest_depth) top_nest--;
5157
2.07k
          else top_nest->nest_depth = nest_depth;
5158
4.40k
          }
5159
3.11k
        else *parsed_pattern++ = META_NOCAPTURE;
5160
5161
        /* If nothing changed, no need to record. */
5162
5163
7.52k
        if (options != oldoptions || xoptions != oldxoptions)
5164
1.50k
          {
5165
1.50k
          *parsed_pattern++ = META_OPTIONS;
5166
1.50k
          *parsed_pattern++ = options;
5167
1.50k
          *parsed_pattern++ = xoptions;
5168
1.50k
          }
5169
7.52k
        }     /* End options processing */
5170
10.0k
      break;  /* End default case after (? */
5171
5172
5173
      /* ---- Python syntax support ---- */
5174
5175
10.0k
      case CHAR_P:
5176
224
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5177
5178
      /* (?P<name> is the same as (?<name>, which defines a named group. */
5179
5180
221
      if (*ptr == CHAR_LESS_THAN_SIGN)
5181
66
        {
5182
66
        terminator = CHAR_GREATER_THAN_SIGN;
5183
66
        goto DEFINE_NAME;
5184
66
        }
5185
5186
      /* (?P>name) is the same as (?&name), which is a recursion or subroutine
5187
      call. */
5188
5189
155
      if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
5190
5191
      /* (?P=name) is the same as \k<name>, a back reference by name. Anything
5192
      else after (?P is an error. */
5193
5194
89
      if (*ptr != CHAR_EQUALS_SIGN)
5195
14
        {
5196
14
        errorcode = ERR41;
5197
14
        goto FAILED_FORWARD;
5198
14
        }
5199
75
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5200
75
          &namelen, &errorcode, cb)) goto FAILED;
5201
66
      *parsed_pattern++ = META_BACKREF_BYNAME;
5202
66
      *parsed_pattern++ = namelen;
5203
66
      PUTOFFSET(offset, parsed_pattern);
5204
66
      okquantifier = TRUE;
5205
66
      break;   /* End of (?P processing */
5206
5207
5208
      /* ---- Recursion/subroutine calls by number ---- */
5209
5210
2.13k
      case CHAR_R:
5211
2.13k
      i = 0;         /* (?R) == (?R0) */
5212
2.13k
      ptr++;
5213
2.13k
      if (ptr >= ptrend || (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_LEFT_PARENTHESIS))
5214
8
        {
5215
8
        errorcode = ERR58;
5216
8
        goto FAILED;
5217
8
        }
5218
2.12k
      terminator = CHAR_NUL;
5219
2.12k
      goto SET_RECURSION;
5220
5221
      /* An item starting (?- followed by a digit comes here via the "default"
5222
      case because (?- followed by a non-digit is an options setting. */
5223
5224
203
      case CHAR_PLUS:
5225
203
      if (ptr + 1 >= ptrend)
5226
3
        {
5227
3
        ++ptr;
5228
3
        goto UNCLOSED_PARENTHESIS;
5229
3
        }
5230
200
      if (!IS_DIGIT(ptr[1]))
5231
6
        {
5232
6
        errorcode = ERR29;   /* Missing number */
5233
6
        ++ptr;
5234
6
        goto FAILED_FORWARD;
5235
6
        }
5236
194
      PCRE2_FALLTHROUGH /* Fall through */
5237
194
5238
20.3k
      case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5239
21.8k
      case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5240
24.8k
      RECURSION_BYNUMBER:
5241
24.8k
      if (!read_number(&ptr, ptrend,
5242
24.8k
          (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
5243
24.8k
          MAX_GROUP_NUMBER, ERR61,
5244
24.8k
          &i, &errorcode)) goto FAILED;
5245
24.8k
      PCRE2_ASSERT(i >= 0);  /* NB (?0) is permitted, represented by i=0 */
5246
24.8k
      terminator = CHAR_NUL;
5247
5248
27.2k
      SET_RECURSION:
5249
27.2k
      *parsed_pattern++ = META_RECURSE | (uint32_t)i;
5250
27.2k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5251
      /* End of recursive call by number handling */
5252
27.2k
      goto READ_RECURSION_ARGUMENTS;
5253
5254
5255
      /* ---- Recursion/subroutine calls by name ---- */
5256
5257
703
      case CHAR_AMPERSAND:
5258
769
      RECURSE_BY_NAME:
5259
769
      if (!read_name(&ptr, ptrend, utf, 0, &offset, &name,
5260
769
          &namelen, &errorcode, cb)) goto FAILED;
5261
747
      *parsed_pattern++ = META_RECURSE_BYNAME;
5262
747
      *parsed_pattern++ = namelen;
5263
747
      terminator = CHAR_NUL;
5264
5265
27.9k
      READ_RECURSION_ARGUMENTS:
5266
27.9k
      PUTOFFSET(offset, parsed_pattern);
5267
27.9k
      okquantifier = TRUE;
5268
5269
      /* Arguments are not supported for \g construct. */
5270
27.9k
      if (terminator != CHAR_NUL) break;
5271
5272
27.7k
      if (ptr < ptrend && *ptr == CHAR_LEFT_PARENTHESIS)
5273
8.83k
        {
5274
8.83k
        parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
5275
8.83k
                                            offset, &errorcode, cb);
5276
8.83k
        if (parsed_pattern == NULL) goto FAILED;
5277
8.83k
        }
5278
5279
27.6k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5280
126
        goto UNCLOSED_PARENTHESIS;
5281
5282
27.4k
      ptr++;
5283
27.4k
      break;
5284
5285
      /* ---- Callout with numerical or string argument ---- */
5286
5287
4.91k
      case CHAR_C:
5288
4.91k
      if ((xoptions & PCRE2_EXTRA_NEVER_CALLOUT) != 0)
5289
0
        {
5290
0
        ptr++;
5291
0
        errorcode = ERR103;
5292
0
        goto FAILED;
5293
0
        }
5294
5295
4.91k
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5296
5297
      /* If the previous item was a condition starting (?(? an assertion,
5298
      optionally preceded by a callout, is expected. This is checked later on,
5299
      during actual compilation. However we need to identify this kind of
5300
      assertion in this pass because it must not be quantified. The value of
5301
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5302
      for a callout - still leaving a positive value that identifies the
5303
      assertion. Multiple callouts or any other items will make it zero or
5304
      less, which doesn't matter because they will cause an error later. */
5305
5306
4.91k
      expect_cond_assert = prev_expect_cond_assert - 1;
5307
5308
      /* If previous_callout is not NULL, it means this follows a previous
5309
      callout. If it was a manual callout, do nothing; this means its "length
5310
      of next pattern item" field will remain zero. If it was an automatic
5311
      callout, abolish it. */
5312
5313
4.91k
      if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
5314
1.67k
          previous_callout == parsed_pattern - 4 &&
5315
1.55k
          parsed_pattern[-1] == 255)
5316
1.32k
        parsed_pattern = previous_callout;
5317
5318
      /* Save for updating next pattern item length, and skip one item before
5319
      completing. */
5320
5321
4.91k
      previous_callout = parsed_pattern;
5322
4.91k
      after_manual_callout = 1;
5323
5324
      /* Handle a string argument; specific delimiter is required. */
5325
5326
4.91k
      if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
5327
1.75k
        {
5328
1.75k
        PCRE2_SIZE calloutlength;
5329
1.75k
        PCRE2_SPTR startptr = ptr;
5330
5331
1.75k
        delimiter = 0;
5332
10.0k
        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
5333
10.0k
          {
5334
10.0k
          if (*ptr == PRIV(callout_start_delims)[i])
5335
1.72k
            {
5336
1.72k
            delimiter = PRIV(callout_end_delims)[i];
5337
1.72k
            break;
5338
1.72k
            }
5339
10.0k
          }
5340
1.75k
        if (delimiter == 0)
5341
33
          {
5342
33
          errorcode = ERR82;
5343
33
          goto FAILED_FORWARD;
5344
33
          }
5345
5346
1.72k
        *parsed_pattern = META_CALLOUT_STRING;
5347
1.72k
        parsed_pattern += 3;   /* Skip pattern info */
5348
5349
1.72k
        for (;;)
5350
10.7k
          {
5351
10.7k
          if (++ptr >= ptrend)
5352
55
            {
5353
55
            errorcode = ERR81;
5354
55
            ptr = startptr;   /* To give a more useful message */
5355
55
            goto FAILED;
5356
55
            }
5357
10.6k
          if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
5358
1.67k
            break;
5359
10.6k
          }
5360
5361
1.67k
        calloutlength = (PCRE2_SIZE)(ptr - startptr);
5362
1.67k
        if (calloutlength > UINT32_MAX)
5363
0
          {
5364
0
          errorcode = ERR72;
5365
0
          goto FAILED;
5366
0
          }
5367
1.67k
        *parsed_pattern++ = (uint32_t)calloutlength;
5368
1.67k
        offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
5369
1.67k
        PUTOFFSET(offset, parsed_pattern);
5370
1.67k
        }
5371
5372
      /* Handle a callout with an optional numerical argument, which must be
5373
      less than or equal to 255. A missing argument gives 0. */
5374
5375
3.15k
      else
5376
3.15k
        {
5377
3.15k
        int n = 0;
5378
3.15k
        *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
5379
3.15k
        parsed_pattern += 3;                       /* Skip pattern info */
5380
3.76k
        while (ptr < ptrend && IS_DIGIT(*ptr))
5381
614
          {
5382
614
          n = n * 10 + (*ptr++ - CHAR_0);
5383
614
          if (n > 255)
5384
3
            {
5385
3
            errorcode = ERR38;
5386
3
            goto FAILED;
5387
3
            }
5388
614
          }
5389
3.15k
        *parsed_pattern++ = n;
5390
3.15k
        }
5391
5392
      /* Both formats must have a closing parenthesis */
5393
5394
4.82k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5395
28
        {
5396
28
        errorcode = ERR39;
5397
28
        goto FAILED;
5398
28
        }
5399
4.79k
      ptr++;
5400
5401
      /* Remember the offset to the next item in the pattern, and set a default
5402
      length. This should get updated after the next item is read. */
5403
5404
4.79k
      previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
5405
4.79k
      previous_callout[2] = 0;
5406
4.79k
      break;                  /* End callout */
5407
5408
5409
      /* ---- Conditional group ---- */
5410
5411
      /* A condition can be an assertion, a number (referring to a numbered
5412
      group's having been set), a name (referring to a named group), or 'R',
5413
      referring to overall recursion. R<digits> and R&name are also permitted
5414
      for recursion state tests. Numbers may be preceded by + or - to specify a
5415
      relative group number.
5416
5417
      There are several syntaxes for testing a named group: (?(name)) is used
5418
      by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5419
5420
      There are two unfortunate ambiguities. 'R' can be the recursive thing or
5421
      the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
5422
      the Perl DEFINE feature or the Python named test. We look for a name
5423
      first; if not found, we try the other case.
5424
5425
      For compatibility with auto-callouts, we allow a callout to be specified
5426
      before a condition that is an assertion. */
5427
5428
14.2k
      case CHAR_LEFT_PARENTHESIS:
5429
14.2k
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5430
14.2k
      nest_depth++;
5431
5432
      /* If the next character is ? or * there must be an assertion next
5433
      (optionally preceded by a callout). We do not check this here, but
5434
      instead we set expect_cond_assert to 2. If this is still greater than
5435
      zero (callouts decrement it) when the next assertion is read, it will be
5436
      marked as a condition that must not be repeated. A value greater than
5437
      zero also causes checking that an assertion (possibly with callout)
5438
      follows. */
5439
5440
14.2k
      if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
5441
7.86k
        {
5442
7.86k
        *parsed_pattern++ = META_COND_ASSERT;
5443
7.86k
        ptr--;   /* Pull pointer back to the opening parenthesis. */
5444
7.86k
        expect_cond_assert = 2;
5445
7.86k
        break;  /* End of conditional */
5446
7.86k
        }
5447
5448
      /* Handle (?([+-]number)... */
5449
5450
6.37k
      if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
5451
6.37k
          &errorcode))
5452
1.00k
        {
5453
1.00k
        PCRE2_ASSERT(i >= 0);
5454
1.00k
        if (i <= 0)
5455
3
          {
5456
3
          errorcode = ERR15;
5457
3
          goto FAILED;
5458
3
          }
5459
1.00k
        *parsed_pattern++ = META_COND_NUMBER;
5460
1.00k
        offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5461
1.00k
        PUTOFFSET(offset, parsed_pattern);
5462
1.00k
        *parsed_pattern++ = i;
5463
1.00k
        }
5464
5.36k
      else if (errorcode != 0) goto FAILED;   /* Number too big */
5465
5466
      /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
5467
5468
5.36k
      else if (ptrend - ptr >= 10 &&
5469
5.12k
               PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
5470
0
               ptr[7] != CHAR_RIGHT_PARENTHESIS)
5471
0
        {
5472
0
        uint32_t ge = 0;
5473
0
        int major = 0;
5474
0
        int minor = 0;
5475
5476
0
        ptr += 7;
5477
0
        if (*ptr == CHAR_GREATER_THAN_SIGN)
5478
0
          {
5479
0
          ge = 1;
5480
0
          ptr++;
5481
0
          }
5482
5483
        /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
5484
        references its argument twice. */
5485
5486
0
        if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
5487
0
          {
5488
0
          errorcode = ERR79;
5489
0
          if (!ge) goto FAILED_FORWARD;
5490
0
          goto FAILED;
5491
0
          }
5492
5493
0
        if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
5494
0
          goto FAILED;
5495
5496
0
        if (ptr < ptrend && *ptr == CHAR_DOT)
5497
0
          {
5498
0
          if (++ptr >= ptrend || !IS_DIGIT(*ptr))
5499
0
            {
5500
0
            errorcode = ERR79;
5501
0
            if (ptr < ptrend) goto FAILED_FORWARD;
5502
0
            goto FAILED;
5503
0
            }
5504
0
          if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &minor, &errorcode))
5505
0
            goto FAILED;
5506
0
          }
5507
0
        if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5508
0
          {
5509
0
          errorcode = ERR79;
5510
0
          if (ptr < ptrend) goto FAILED_FORWARD;
5511
0
          goto FAILED;
5512
0
          }
5513
5514
0
        *parsed_pattern++ = META_COND_VERSION;
5515
0
        *parsed_pattern++ = ge;
5516
0
        *parsed_pattern++ = major;
5517
0
        *parsed_pattern++ = minor;
5518
0
        }
5519
5520
      /* All the remaining cases now require us to read a name. We cannot at
5521
      this stage distinguish ambiguous cases such as (?(R12) which might be a
5522
      recursion test by number or a name, because the named groups have not yet
5523
      all been identified. Those cases are treated as names, but given a
5524
      different META code. */
5525
5526
5.36k
      else
5527
5.36k
        {
5528
5.36k
        BOOL was_r_ampersand = FALSE;
5529
5530
5.36k
        if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
5531
275
          {
5532
275
          terminator = CHAR_RIGHT_PARENTHESIS;
5533
275
          was_r_ampersand = TRUE;
5534
275
          ptr++;
5535
275
          }
5536
5.08k
        else if (*ptr == CHAR_LESS_THAN_SIGN)
5537
1.43k
          terminator = CHAR_GREATER_THAN_SIGN;
5538
3.65k
        else if (*ptr == CHAR_APOSTROPHE)
5539
3
          terminator = CHAR_APOSTROPHE;
5540
3.64k
        else
5541
3.64k
          {
5542
3.64k
          terminator = CHAR_RIGHT_PARENTHESIS;
5543
3.64k
          ptr--;   /* Point to char before name */
5544
3.64k
          }
5545
5546
5.36k
        if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5547
5.36k
            &errorcode, cb)) goto FAILED;
5548
5549
        /* Handle (?(R&name) */
5550
5551
5.23k
        if (was_r_ampersand)
5552
245
          {
5553
245
          *parsed_pattern = META_COND_RNAME;
5554
245
          ptr--;   /* Back to closing parens */
5555
245
          }
5556
5557
        /* Handle (?(name). If the name is "DEFINE" we identify it with a
5558
        special code. Likewise if the name consists of R followed only by
5559
        digits. Otherwise, handle it like a quoted name. */
5560
5561
4.99k
        else if (terminator == CHAR_RIGHT_PARENTHESIS)
5562
3.56k
          {
5563
3.56k
          if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
5564
0
            *parsed_pattern = META_COND_DEFINE;
5565
3.56k
          else
5566
3.56k
            {
5567
6.49k
            for (i = 1; i < (int)namelen; i++)
5568
3.21k
              if (!IS_DIGIT(name[i])) break;
5569
3.56k
            *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
5570
2.40k
              META_COND_RNUMBER : META_COND_NAME;
5571
3.56k
            }
5572
3.56k
          ptr--;   /* Back to closing parens */
5573
3.56k
          }
5574
5575
        /* Handle (?('name') or (?(<name>) */
5576
5577
1.42k
        else *parsed_pattern = META_COND_NAME;
5578
5579
        /* All these cases except DEFINE end with the name length and offset;
5580
        DEFINE just has an offset (for the "too many branches" error). */
5581
5582
5.23k
        if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
5583
5.23k
        PUTOFFSET(offset, parsed_pattern);
5584
5.23k
        }  /* End cases that read a name */
5585
5586
      /* Check the closing parenthesis of the condition */
5587
5588
6.23k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5589
12
        {
5590
12
        errorcode = ERR24;
5591
12
        goto FAILED;
5592
12
        }
5593
6.22k
      ptr++;
5594
6.22k
      break;  /* End of condition processing */
5595
5596
5597
      /* ---- Atomic group ---- */
5598
5599
5.07k
      case CHAR_GREATER_THAN_SIGN:
5600
5.07k
      ATOMIC_GROUP:                          /* Come from (*atomic: */
5601
5.07k
      *parsed_pattern++ = META_ATOMIC;
5602
5.07k
      nest_depth++;
5603
5.07k
      ptr++;
5604
5.07k
      break;
5605
5606
5607
      /* ---- Lookahead assertions ---- */
5608
5609
10.0k
      case CHAR_EQUALS_SIGN:
5610
10.1k
      POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
5611
10.1k
      *parsed_pattern++ = META_LOOKAHEAD;
5612
10.1k
      ptr++;
5613
10.1k
      goto POST_ASSERTION;
5614
5615
12.1k
      case CHAR_ASTERISK:
5616
12.1k
      POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (*napla: */
5617
12.1k
      *parsed_pattern++ = META_LOOKAHEAD_NA;
5618
12.1k
      ptr++;
5619
12.1k
      goto POST_ASSERTION;
5620
5621
8.12k
      case CHAR_EXCLAMATION_MARK:
5622
8.31k
      NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
5623
8.31k
      *parsed_pattern++ = META_LOOKAHEADNOT;
5624
8.31k
      ptr++;
5625
8.31k
      goto POST_ASSERTION;
5626
5627
5628
      /* ---- Lookbehind assertions ---- */
5629
5630
      /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
5631
      is the start of the name of a capturing group. */
5632
5633
28.3k
      case CHAR_LESS_THAN_SIGN:
5634
28.3k
      if (ptrend - ptr <= 1 ||
5635
28.3k
         (ptr[1] != CHAR_EQUALS_SIGN &&
5636
20.0k
          ptr[1] != CHAR_EXCLAMATION_MARK &&
5637
15.2k
          ptr[1] != CHAR_ASTERISK))
5638
13.0k
        {
5639
13.0k
        terminator = CHAR_GREATER_THAN_SIGN;
5640
13.0k
        goto DEFINE_NAME;
5641
13.0k
        }
5642
15.2k
      *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
5643
8.27k
        META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
5644
4.83k
        META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
5645
5646
15.6k
      POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
5647
15.6k
      *has_lookbehind = TRUE;
5648
15.6k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5649
15.6k
      PUTOFFSET(offset, parsed_pattern);
5650
15.6k
      ptr += 2;
5651
      /* Fall through */
5652
5653
      /* If the previous item was a condition starting (?(? an assertion,
5654
      optionally preceded by a callout, is expected. This is checked later on,
5655
      during actual compilation. However we need to identify this kind of
5656
      assertion in this pass because it must not be qualified. The value of
5657
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5658
      for a callout - still leaving a positive value that identifies the
5659
      assertion. Multiple callouts or any other items will make it zero or
5660
      less, which doesn't matter because they will cause an error later. */
5661
5662
47.0k
      POST_ASSERTION:
5663
47.0k
      nest_depth++;
5664
47.0k
      if (prev_expect_cond_assert > 0)
5665
7.78k
        {
5666
7.78k
        if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
5667
1.20k
        else if (++top_nest >= end_nests)
5668
0
          {
5669
0
          errorcode = ERR84;
5670
0
          goto FAILED;
5671
0
          }
5672
7.78k
        top_nest->nest_depth = nest_depth;
5673
7.78k
        top_nest->flags = NSF_CONDASSERT;
5674
7.78k
        top_nest->options = options & PARSE_TRACKED_OPTIONS;
5675
7.78k
        top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5676
7.78k
        }
5677
47.0k
      break;
5678
5679
5680
      /* ---- Define a named group ---- */
5681
5682
      /* A named group may be defined as (?'name') or (?<name>). In the latter
5683
      case we jump to DEFINE_NAME from the disambiguation of (?< above with the
5684
      terminator set to '>'. */
5685
5686
47.0k
      case CHAR_APOSTROPHE:
5687
336
      terminator = CHAR_APOSTROPHE;    /* Terminator */
5688
5689
13.4k
      DEFINE_NAME:
5690
13.4k
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5691
13.4k
          &errorcode, cb)) goto FAILED;
5692
5693
      /* We have a name for this capturing group. It is also assigned a number,
5694
      which is its primary means of identification. */
5695
5696
13.3k
      if (cb->bracount >= MAX_GROUP_NUMBER)
5697
0
        {
5698
0
        errorcode = ERR97;
5699
0
        goto FAILED;
5700
0
        }
5701
13.3k
      cb->bracount++;
5702
13.3k
      *parsed_pattern++ = META_CAPTURE | cb->bracount;
5703
13.3k
      nest_depth++;
5704
5705
      /* Check not too many names */
5706
5707
13.3k
      if (cb->names_found >= MAX_NAME_COUNT)
5708
0
        {
5709
0
        errorcode = ERR49;
5710
0
        goto FAILED;
5711
0
        }
5712
5713
      /* Adjust the entry size to accommodate the longest name found. */
5714
5715
13.3k
      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
5716
2.38k
        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
5717
5718
      /* Scan the list to check for duplicates. For duplicate names, if the
5719
      number is the same, break the loop, which causes the name to be
5720
      discarded; otherwise, if DUPNAMES is not set, give an error.
5721
      If it is set, allow the name with a different number, but continue
5722
      scanning in case this is a duplicate with the same number. For
5723
      non-duplicate names, give an error if the number is duplicated. */
5724
5725
13.3k
      is_dupname = FALSE;
5726
13.3k
      hash = PRIV(compile_get_hash_from_name)(name, namelen);
5727
13.3k
      ng = cb->named_groups;
5728
23.4k
      for (i = 0; i < cb->names_found; i++, ng++)
5729
20.5k
        {
5730
20.5k
        if (namelen == ng->length && hash == NAMED_GROUP_GET_HASH(ng) &&
5731
11.6k
            PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
5732
10.3k
          {
5733
          /* When a bracket is referenced by the same name multiple
5734
          times, it is not considered a duplicate and is ignored. */
5735
10.3k
          if (ng->number == cb->bracount) break;
5736
10.2k
          if ((options & PCRE2_DUPNAMES) == 0)
5737
745
            {
5738
745
            errorcode = ERR43;
5739
745
            goto FAILED;
5740
745
            }
5741
5742
9.53k
          ng->hash_dup |= NAMED_GROUP_IS_DUPNAME;
5743
9.53k
          is_dupname = TRUE;                /* Mark as a duplicate */
5744
9.53k
          cb->dupnames = TRUE;              /* Duplicate names exist */
5745
5746
          /* The entry represents a duplicate. */
5747
9.53k
          name = ng->name;
5748
9.53k
          namelen = 0;
5749
5750
          /* Even duplicated names may refer to the same
5751
          capture index. These references are also ignored. */
5752
987k
          for (; i < cb->names_found; i++, ng++)
5753
978k
            if (ng->name == name && ng->number == cb->bracount)
5754
349
              break;
5755
9.53k
          break;
5756
10.2k
          }
5757
10.1k
        else if (ng->number == cb->bracount)
5758
3
          {
5759
3
          errorcode = ERR65;
5760
3
          goto FAILED;
5761
3
          }
5762
20.5k
        }
5763
5764
      /* Ignore duplicate with same number. */
5765
12.5k
      if (i < cb->names_found) break;
5766
5767
      /* Increase the list size if necessary */
5768
5769
12.1k
      if (cb->names_found >= cb->named_group_list_size)
5770
169
        {
5771
169
        uint32_t newsize = cb->named_group_list_size * 2;
5772
169
        named_group *newspace =
5773
169
          cb->cx->memctl.malloc(newsize * sizeof(named_group),
5774
169
          cb->cx->memctl.memory_data);
5775
169
        if (newspace == NULL)
5776
0
          {
5777
0
          errorcode = ERR21;
5778
0
          goto FAILED;
5779
0
          }
5780
5781
169
        memcpy(newspace, cb->named_groups,
5782
169
          cb->named_group_list_size * sizeof(named_group));
5783
169
        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5784
94
          cb->cx->memctl.free((void *)cb->named_groups,
5785
94
          cb->cx->memctl.memory_data);
5786
169
        cb->named_groups = newspace;
5787
169
        cb->named_group_list_size = newsize;
5788
169
        }
5789
5790
      /* Add this name to the list */
5791
12.1k
      if (is_dupname)
5792
9.18k
        hash |= NAMED_GROUP_IS_DUPNAME;
5793
5794
12.1k
      cb->named_groups[cb->names_found].name = name;
5795
12.1k
      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
5796
12.1k
      cb->named_groups[cb->names_found].number = cb->bracount;
5797
12.1k
      cb->named_groups[cb->names_found].hash_dup = hash;
5798
12.1k
      cb->names_found++;
5799
12.1k
      break;
5800
5801
5802
      /* ---- Perl extended character class ---- */
5803
5804
      /* These are of the form '(?[...])'. We handle these via the same parser
5805
      that consumes ordinary '[...]' classes, but with a flag set to activate
5806
      the extended behaviour. */
5807
5808
961
      case CHAR_LEFT_SQUARE_BRACKET:
5809
961
      class_mode_state = CLASS_MODE_PERL_EXT;
5810
961
      c = *ptr++;
5811
961
      goto FROM_PERL_EXTENDED_CLASS;
5812
122k
      }        /* End of (? switch */
5813
121k
    break;     /* End of ( handling */
5814
5815
5816
    /* ---- Branch terminators ---- */
5817
5818
    /* Alternation: reset the capture count if we are in a (?| group. */
5819
5820
121k
    case CHAR_VERTICAL_LINE:
5821
115k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
5822
9.15k
        (top_nest->flags & NSF_RESET) != 0)
5823
4.10k
      {
5824
4.10k
      if (cb->bracount > top_nest->max_group)
5825
777
        top_nest->max_group = (uint16_t)cb->bracount;
5826
4.10k
      cb->bracount = top_nest->reset_group;
5827
4.10k
      }
5828
115k
    *parsed_pattern++ = META_ALT;
5829
115k
    break;
5830
5831
    /* End of group; reset the capture count to the maximum if we are in a (?|
5832
    group and/or reset the options that are tracked during parsing. Disallow
5833
    quantifier for a condition that is an assertion. */
5834
5835
203k
    case CHAR_RIGHT_PARENTHESIS:
5836
203k
    okquantifier = TRUE;
5837
203k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
5838
11.8k
      {
5839
11.8k
      options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
5840
11.8k
      xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
5841
11.8k
      if ((top_nest->flags & NSF_RESET) != 0 &&
5842
1.91k
          top_nest->max_group > cb->bracount)
5843
340
        cb->bracount = top_nest->max_group;
5844
11.8k
      if ((top_nest->flags & NSF_CONDASSERT) != 0)
5845
6.91k
        okquantifier = FALSE;
5846
5847
11.8k
      if ((top_nest->flags & NSF_ATOMICSR) != 0)
5848
535
        {
5849
535
        *parsed_pattern++ = META_KET;
5850
5851
#ifdef PCRE2_DEBUG
5852
        PCRE2_ASSERT(parsed_pattern_extra > 0);
5853
        parsed_pattern_extra--;
5854
#endif
5855
535
        }
5856
5857
11.8k
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
5858
2.66k
        else top_nest--;
5859
11.8k
      }
5860
203k
    if (nest_depth == 0)    /* Unmatched closing parenthesis */
5861
254
      {
5862
254
      errorcode = ERR22;
5863
254
      goto FAILED;
5864
254
      }
5865
203k
    nest_depth--;
5866
203k
    *parsed_pattern++ = META_KET;
5867
203k
    break;
5868
7.32M
    }  /* End of switch on pattern character */
5869
7.32M
  }    /* End of main character scan loop */
5870
5871
/* End of pattern reached. Check for missing ) at the end of a verb name. */
5872
5873
76.2k
if (inverbname && ptr >= ptrend)
5874
213
  {
5875
213
  errorcode = ERR60;
5876
213
  goto FAILED;
5877
213
  }
5878
5879
5880
76.0k
PARSED_END:
5881
5882
76.0k
PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
5883
76.0k
             (parsed_pattern_extra - parsed_pattern_extra_check) <=
5884
76.0k
               max_parsed_pattern(ptr_check, ptr, utf, options));
5885
5886
/* Manage callout for the final item */
5887
5888
76.0k
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
5889
76.0k
  parsed_pattern, cb);
5890
5891
/* Insert trailing items for word and line matching (features provided for the
5892
benefit of pcre2grep). */
5893
5894
76.0k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
5895
0
  {
5896
0
  *parsed_pattern++ = META_KET;
5897
0
  *parsed_pattern++ = META_DOLLAR;
5898
0
  }
5899
76.0k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5900
0
  {
5901
0
  *parsed_pattern++ = META_KET;
5902
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
5903
0
  }
5904
5905
/* Terminate the parsed pattern, then return success if all groups are closed.
5906
Otherwise we have unclosed parentheses. */
5907
5908
/* LCOV_EXCL_START */
5909
76.0k
if (parsed_pattern >= parsed_pattern_end)
5910
0
  {
5911
0
  PCRE2_DEBUG_UNREACHABLE();
5912
0
  errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5913
0
  goto FAILED;
5914
0
  }
5915
/* LCOV_EXCL_STOP */
5916
5917
76.0k
*parsed_pattern = META_END;
5918
76.0k
if (nest_depth == 0) return 0;
5919
5920
1.21k
UNCLOSED_PARENTHESIS:
5921
1.21k
errorcode = ERR14;
5922
5923
/* Come here for all failures. */
5924
5925
7.06k
FAILED:
5926
7.06k
cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5927
7.06k
return errorcode;
5928
5929
/* Some errors need to indicate the previous character. */
5930
5931
19
FAILED_BACK:
5932
19
ptr--;
5933
19
#ifdef SUPPORT_UNICODE
5934
19
if (utf) BACKCHAR(ptr);
5935
19
#endif
5936
19
goto FAILED;
5937
5938
/* Some errors need to indicate the next character. */
5939
5940
76
FAILED_FORWARD:
5941
76
ptr++;
5942
76
#ifdef SUPPORT_UNICODE
5943
76
if (utf) FORWARDCHARTEST(ptr, ptrend);
5944
76
#endif
5945
76
goto FAILED;
5946
1.21k
}
5947
5948
5949
5950
/*************************************************
5951
*       Find first significant opcode            *
5952
*************************************************/
5953
5954
/* This is called by several functions that scan a compiled expression looking
5955
for a fixed first character, or an anchoring opcode etc. It skips over things
5956
that do not influence this. For some calls, it makes sense to skip negative
5957
forward and all backward assertions, and also the \b assertion; for others it
5958
does not.
5959
5960
Arguments:
5961
  code         pointer to the start of the group
5962
  skipassert   TRUE if certain assertions are to be skipped
5963
5964
Returns:       pointer to the first significant opcode
5965
*/
5966
5967
static const PCRE2_UCHAR*
5968
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5969
189k
{
5970
189k
for (;;)
5971
400k
  {
5972
400k
  switch ((int)*code)
5973
400k
    {
5974
59.5k
    case OP_ASSERT_NOT:
5975
93.2k
    case OP_ASSERTBACK:
5976
132k
    case OP_ASSERTBACK_NOT:
5977
183k
    case OP_ASSERTBACK_NA:
5978
183k
    if (!skipassert) return code;
5979
226k
    do code += GET(code, 1); while (*code == OP_ALT);
5980
179k
    code += PRIV(OP_lengths)[*code];
5981
179k
    break;
5982
5983
393
    case OP_WORD_BOUNDARY:
5984
842
    case OP_NOT_WORD_BOUNDARY:
5985
1.01k
    case OP_UCP_WORD_BOUNDARY:
5986
1.18k
    case OP_NOT_UCP_WORD_BOUNDARY:
5987
1.18k
    if (!skipassert) return code;
5988
585
    PCRE2_FALLTHROUGH /* Fall through */
5989
585
5990
29.8k
    case OP_CALLOUT:
5991
29.8k
    case OP_CREF:
5992
29.8k
    case OP_DNCREF:
5993
29.8k
    case OP_RREF:
5994
29.8k
    case OP_DNRREF:
5995
29.8k
    case OP_FALSE:
5996
29.8k
    case OP_TRUE:
5997
29.8k
    code += PRIV(OP_lengths)[*code];
5998
29.8k
    break;
5999
6000
217
    case OP_CALLOUT_STR:
6001
217
    code += GET(code, 1 + 2*LINK_SIZE);
6002
217
    break;
6003
6004
256
    case OP_SKIPZERO:
6005
256
    code += 2 + GET(code, 2) + LINK_SIZE;
6006
256
    break;
6007
6008
4.95k
    case OP_COND:
6009
5.07k
    case OP_SCOND:
6010
5.07k
    if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
6011
0
        code[GET(code, 1)] != OP_KET)      /* More than one branch */
6012
5.07k
      return code;
6013
0
    code += GET(code, 1) + 1 + LINK_SIZE;
6014
0
    break;
6015
6016
682
    case OP_MARK:
6017
804
    case OP_COMMIT_ARG:
6018
907
    case OP_PRUNE_ARG:
6019
1.23k
    case OP_SKIP_ARG:
6020
1.47k
    case OP_THEN_ARG:
6021
1.47k
    code += code[1] + PRIV(OP_lengths)[*code];
6022
1.47k
    break;
6023
6024
179k
    default:
6025
179k
    return code;
6026
400k
    }
6027
400k
  }
6028
6029
/* LCOV_EXCL_START */
6030
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
6031
/* LCOV_EXCL_STOP */
6032
0
}
6033
6034
6035
6036
/*************************************************
6037
*           Compile one branch                   *
6038
*************************************************/
6039
6040
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
6041
the options are changed during the branch, the pointer is used to change the
6042
external options bits. This function is used during the pre-compile phase when
6043
we are trying to find out the amount of memory needed, as well as during the
6044
real compile phase. The value of lengthptr distinguishes the two phases.
6045
6046
Arguments:
6047
  optionsptr        pointer to the option bits
6048
  xoptionsptr       pointer to the extra option bits
6049
  codeptr           points to the pointer to the current code point
6050
  pptrptr           points to the current parsed pattern pointer
6051
  errorcodeptr      points to error code variable
6052
  firstcuptr        place to put the first required code unit
6053
  firstcuflagsptr   place to put the first code unit flags
6054
  reqcuptr          place to put the last required code unit
6055
  reqcuflagsptr     place to put the last required code unit flags
6056
  bcptr             points to current branch chain
6057
  open_caps         points to current capitem
6058
  cb                contains pointers to tables etc.
6059
  lengthptr         NULL during the real compile phase
6060
                    points to length accumulator during pre-compile phase
6061
6062
Returns:            0 There's been an error, *errorcodeptr is non-zero
6063
                   +1 Success, this branch must match at least one character
6064
                   -1 Success, this branch may match an empty string
6065
*/
6066
6067
static int
6068
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
6069
  PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
6070
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
6071
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
6072
  compile_block *cb, PCRE2_SIZE *lengthptr)
6073
662k
{
6074
662k
int bravalue = 0;
6075
662k
int okreturn = -1;
6076
662k
int group_return = 0;
6077
662k
uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
6078
662k
uint32_t greedy_default, greedy_non_default;
6079
662k
uint32_t repeat_type, op_type;
6080
662k
uint32_t options = *optionsptr;               /* May change dynamically */
6081
662k
uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
6082
662k
uint32_t firstcu, reqcu;
6083
662k
uint32_t zeroreqcu, zerofirstcu;
6084
662k
uint32_t *pptr = *pptrptr;
6085
662k
uint32_t meta, meta_arg;
6086
662k
uint32_t firstcuflags, reqcuflags;
6087
662k
uint32_t zeroreqcuflags, zerofirstcuflags;
6088
662k
uint32_t req_caseopt, reqvary, tempreqvary;
6089
/* Some opcodes, such as META_CAPTURE_NUMBER or META_CAPTURE_NAME,
6090
depend on the previous value of offset. */
6091
662k
PCRE2_SIZE offset = 0;
6092
662k
PCRE2_SIZE length_prevgroup = 0;
6093
662k
PCRE2_UCHAR *code = *codeptr;
6094
662k
PCRE2_UCHAR *last_code = code;
6095
662k
PCRE2_UCHAR *orig_code = code;
6096
662k
PCRE2_UCHAR *tempcode;
6097
662k
PCRE2_UCHAR *previous = NULL;
6098
662k
PCRE2_UCHAR op_previous;
6099
662k
BOOL groupsetfirstcu = FALSE;
6100
662k
BOOL had_accept = FALSE;
6101
662k
BOOL matched_char = FALSE;
6102
662k
BOOL previous_matched_char = FALSE;
6103
662k
BOOL reset_caseful = FALSE;
6104
6105
/* We can fish out the UTF setting once and for all into a BOOL, but we must
6106
not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
6107
as we process the pattern. */
6108
6109
662k
#ifdef SUPPORT_UNICODE
6110
662k
BOOL utf = (options & PCRE2_UTF) != 0;
6111
662k
BOOL ucp = (options & PCRE2_UCP) != 0;
6112
#else  /* No Unicode support */
6113
BOOL utf = FALSE;
6114
#endif
6115
6116
/* Set up the default and non-default settings for greediness */
6117
6118
662k
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6119
662k
greedy_non_default = greedy_default ^ 1;
6120
6121
/* Initialize no first unit, no required unit. REQ_UNSET means "no char
6122
matching encountered yet". It gets changed to REQ_NONE if we hit something that
6123
matches a non-fixed first unit; reqcu just remains unset if we never find one.
6124
6125
When we hit a repeat whose minimum is zero, we may have to adjust these values
6126
to take the zero repeat into account. This is implemented by setting them to
6127
zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
6128
item types that can be repeated set these backoff variables appropriately. */
6129
6130
662k
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
6131
662k
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
6132
6133
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
6134
according to the current setting of the caseless flag. The REQ_CASELESS value
6135
leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
6136
to record the case status of the value. This is used only for ASCII characters.
6137
*/
6138
6139
662k
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6140
6141
/* Switch on next META item until the end of the branch */
6142
6143
11.7M
for (;; pptr++)
6144
12.4M
  {
6145
12.4M
  BOOL possessive_quantifier;
6146
12.4M
  BOOL note_group_empty;
6147
12.4M
  uint32_t mclength;
6148
12.4M
  uint32_t skipunits;
6149
12.4M
  uint32_t subreqcu, subfirstcu;
6150
12.4M
  uint32_t groupnumber;
6151
12.4M
  uint32_t verbarglen, verbculen;
6152
12.4M
  uint32_t subreqcuflags, subfirstcuflags;
6153
12.4M
  open_capitem *oc;
6154
12.4M
  PCRE2_UCHAR mcbuffer[8];
6155
6156
  /* Get next META item in the pattern and its potential argument. */
6157
6158
12.4M
  meta = META_CODE(*pptr);
6159
12.4M
  meta_arg = META_DATA(*pptr);
6160
6161
  /* If we are in the pre-compile phase, accumulate the length used for the
6162
  previous cycle of this loop, unless the next item is a quantifier. */
6163
6164
12.4M
  if (lengthptr != NULL)
6165
6.40M
    {
6166
    /* LCOV_EXCL_START */
6167
6.40M
    if (code >= cb->start_workspace + cb->workspace_size)
6168
0
      {
6169
0
      PCRE2_DEBUG_UNREACHABLE();
6170
0
      *errorcodeptr = ERR52;  /* Over-ran workspace - internal error */
6171
0
      cb->erroroffset = 0;
6172
0
      return 0;
6173
0
      }
6174
    /* LCOV_EXCL_STOP */
6175
6176
6.40M
    if (code > cb->start_workspace + cb->workspace_size -
6177
6.40M
        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
6178
0
      {
6179
0
      *errorcodeptr = ERR86;  /* Pattern too complicated */
6180
0
      cb->erroroffset = 0;
6181
0
      return 0;
6182
0
      }
6183
6184
    /* There is at least one situation where code goes backwards: this is the
6185
    case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
6186
    is processed, the whole class is eliminated. However, it is created first,
6187
    so we have to allow memory for it. Therefore, don't ever reduce the length
6188
    at this point. */
6189
6190
6.40M
    if (code < last_code) code = last_code;
6191
6192
    /* If the next thing is not a quantifier, we add the length of the previous
6193
    item into the total, and reset the code pointer to the start of the
6194
    workspace. Otherwise leave the previous item available to be quantified. */
6195
6196
6.40M
    if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6197
6.09M
      {
6198
6.09M
      if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
6199
0
        {
6200
0
        *errorcodeptr = ERR20;   /* Integer overflow */
6201
0
        cb->erroroffset = 0;
6202
0
        return 0;
6203
0
        }
6204
6.09M
      *lengthptr += (PCRE2_SIZE)(code - orig_code);
6205
6.09M
      if (*lengthptr > MAX_PATTERN_SIZE)
6206
265
        {
6207
265
        *errorcodeptr = ERR20;   /* Pattern is too large */
6208
265
        cb->erroroffset = 0;
6209
265
        return 0;
6210
265
        }
6211
6.09M
      code = orig_code;
6212
6.09M
      }
6213
6214
    /* Remember where this code item starts so we can catch the "backwards"
6215
    case above next time round. */
6216
6217
6.40M
    last_code = code;
6218
6.40M
    }
6219
6220
  /* Process the next parsed pattern item. If it is not a quantifier, remember
6221
  where it starts so that it can be quantified when a quantifier follows.
6222
  Checking for the legality of quantifiers happens in parse_regex(), except for
6223
  a quantifier after an assertion that is a condition. */
6224
6225
12.4M
  if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6226
11.8M
    {
6227
11.8M
    previous = code;
6228
11.8M
    if (matched_char && !had_accept) okreturn = 1;
6229
11.8M
    }
6230
6231
12.4M
  previous_matched_char = matched_char;
6232
12.4M
  matched_char = FALSE;
6233
12.4M
  note_group_empty = FALSE;
6234
12.4M
  skipunits = 0;         /* Default value for most subgroups */
6235
6236
12.4M
  switch(meta)
6237
12.4M
    {
6238
    /* ===================================================================*/
6239
    /* The branch terminates at pattern end or | or ) */
6240
6241
145k
    case META_END:
6242
332k
    case META_ALT:
6243
659k
    case META_KET:
6244
659k
    *firstcuptr = firstcu;
6245
659k
    *firstcuflagsptr = firstcuflags;
6246
659k
    *reqcuptr = reqcu;
6247
659k
    *reqcuflagsptr = reqcuflags;
6248
659k
    *codeptr = code;
6249
659k
    *pptrptr = pptr;
6250
659k
    return okreturn;
6251
6252
6253
    /* ===================================================================*/
6254
    /* Handle single-character metacharacters. In multiline mode, ^ disables
6255
    the setting of any following char as a first character. */
6256
6257
463k
    case META_CIRCUMFLEX:
6258
463k
    if ((options & PCRE2_MULTILINE) != 0)
6259
12.4k
      {
6260
12.4k
      if (firstcuflags == REQ_UNSET)
6261
2.63k
        zerofirstcuflags = firstcuflags = REQ_NONE;
6262
12.4k
      *code++ = OP_CIRCM;
6263
12.4k
      }
6264
451k
    else *code++ = OP_CIRC;
6265
463k
    break;
6266
6267
14.1k
    case META_DOLLAR:
6268
14.1k
    *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
6269
14.1k
    break;
6270
6271
    /* There can never be a first char if '.' is first, whatever happens about
6272
    repeats. The value of reqcu doesn't change either. */
6273
6274
36.9k
    case META_DOT:
6275
36.9k
    matched_char = TRUE;
6276
36.9k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6277
36.9k
    zerofirstcu = firstcu;
6278
36.9k
    zerofirstcuflags = firstcuflags;
6279
36.9k
    zeroreqcu = reqcu;
6280
36.9k
    zeroreqcuflags = reqcuflags;
6281
36.9k
    *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
6282
36.9k
    break;
6283
6284
6285
    /* ===================================================================*/
6286
    /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
6287
    Otherwise, an initial ']' is taken as a data character. When empty classes
6288
    are allowed, [] must generate an empty class - we have no dedicated opcode
6289
    to optimise the representation, but it's a rare case (the '(*FAIL)'
6290
    construct would be a clearer way for a pattern author to represent a
6291
    non-matching branch, but it does have different semantics to '[]' if both
6292
    are followed by a quantifier). The empty-negated [^] matches any character,
6293
    so is useful: generate OP_ALLANY for this. */
6294
6295
2.60k
    case META_CLASS_EMPTY:
6296
3.34k
    case META_CLASS_EMPTY_NOT:
6297
3.34k
    matched_char = TRUE;
6298
3.34k
    if (meta == META_CLASS_EMPTY_NOT) *code++ = OP_ALLANY;
6299
2.60k
    else
6300
2.60k
      {
6301
2.60k
      *code++ = OP_CLASS;
6302
2.60k
      memset(code, 0, 32);
6303
2.60k
      code += 32 / sizeof(PCRE2_UCHAR);
6304
2.60k
      }
6305
6306
3.34k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6307
3.34k
    zerofirstcu = firstcu;
6308
3.34k
    zerofirstcuflags = firstcuflags;
6309
3.34k
    break;
6310
6311
6312
    /* ===================================================================*/
6313
    /* Non-empty character class. If the included characters are all < 256, we
6314
    build a 32-byte bitmap of the permitted characters, except in the special
6315
    case where there is only one such character. For negated classes, we build
6316
    the map as usual, then invert it at the end. However, we use a different
6317
    opcode so that data characters > 255 can be handled correctly.
6318
6319
    If the class contains characters outside the 0-255 range, a different
6320
    opcode is compiled. It may optionally have a bit map for characters < 256,
6321
    but those above are explicitly listed afterwards. A flag code unit tells
6322
    whether the bitmap is present, and whether this is a negated class or
6323
    not. */
6324
6325
77.2k
    case META_CLASS_NOT:
6326
147k
    case META_CLASS:
6327
147k
    matched_char = TRUE;
6328
6329
    /* Check for complex extended classes and handle them separately. */
6330
6331
147k
    if ((*pptr & CLASS_IS_ECLASS) != 0)
6332
9.24k
      {
6333
9.24k
      if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,
6334
9.24k
                                      errorcodeptr, cb, lengthptr))
6335
20
        return 0;
6336
9.22k
      goto CLASS_END_PROCESSING;
6337
9.24k
      }
6338
6339
    /* We can optimize the case of a single character in a class by generating
6340
    OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
6341
    negative. In the negative case there can be no first char if this item is
6342
    first, whatever repeat count may follow. In the case of reqcu, save the
6343
    previous value for reinstating. */
6344
6345
    /* NOTE: at present this optimization is not effective if the only
6346
    character in a class in 32-bit, non-UCP mode has its top bit set. */
6347
6348
137k
    if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
6349
56.1k
      {
6350
56.1k
      uint32_t c = pptr[1];
6351
6352
56.1k
      pptr += 2;                 /* Move on to class end */
6353
56.1k
      if (meta == META_CLASS)    /* A positive one-char class can be */
6354
3.77k
        {                        /* handled as a normal literal character. */
6355
3.77k
        meta = c;                /* Set up the character */
6356
3.77k
        goto NORMAL_CHAR_SET;
6357
3.77k
        }
6358
6359
      /* Handle a negative one-character class */
6360
6361
52.3k
      zeroreqcu = reqcu;
6362
52.3k
      zeroreqcuflags = reqcuflags;
6363
52.3k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6364
52.3k
      zerofirstcu = firstcu;
6365
52.3k
      zerofirstcuflags = firstcuflags;
6366
6367
      /* For caseless UTF or UCP mode, check whether this character has more
6368
      than one other case. If so, generate a special OP_NOTPROP item instead of
6369
      OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
6370
      caseless set that starts with an ASCII character. If the character is
6371
      affected by the special Turkish rules, hardcode the not-matching
6372
      characters using a caseset. */
6373
6374
52.3k
#ifdef SUPPORT_UNICODE
6375
52.3k
      if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
6376
6.36k
        {
6377
6.36k
        uint32_t caseset;
6378
6379
6.36k
        if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6380
6.36k
              PCRE2_EXTRA_TURKISH_CASING &&
6381
0
            UCD_ANY_I(c))
6382
0
          {
6383
0
          caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
6384
0
          }
6385
6.36k
        else if ((caseset = UCD_CASESET(c)) != 0 &&
6386
1.12k
                 (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6387
595
                 PRIV(ucd_caseless_sets)[caseset] < 128)
6388
395
          {
6389
395
          caseset = 0;  /* Ignore the caseless set if it's restricted. */
6390
395
          }
6391
6392
6.36k
        if (caseset != 0)
6393
726
          {
6394
726
          *code++ = OP_NOTPROP;
6395
726
          *code++ = PT_CLIST;
6396
726
          *code++ = caseset;
6397
726
          break;   /* We are finished with this class */
6398
726
          }
6399
6.36k
        }
6400
51.6k
#endif
6401
      /* Char has only one other (usable) case, or UCP not available */
6402
6403
51.6k
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
6404
51.6k
      code += PUTCHAR(c, code);
6405
51.6k
      break;   /* We are finished with this class */
6406
52.3k
      }        /* End of 1-char optimization */
6407
6408
    /* Handle character classes that contain more than just one literal
6409
    character. If there are exactly two characters in a positive class, see if
6410
    they are case partners. This can be optimized to generate a caseless single
6411
    character match (which also sets first/required code units if relevant).
6412
    When casing restrictions apply, ignore a caseless set if both characters
6413
    are ASCII. When Turkish casing applies, an 'i' does not match its normal
6414
    Unicode "othercase". */
6415
6416
81.7k
    if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
6417
44.2k
        pptr[3] == META_CLASS_END)
6418
13.0k
      {
6419
13.0k
      uint32_t c = pptr[1];
6420
6421
13.0k
#ifdef SUPPORT_UNICODE
6422
13.0k
      if ((UCD_CASESET(c) == 0 ||
6423
1.76k
           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6424
891
            c < 128 && pptr[2] < 128)) &&
6425
11.6k
          !((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6426
11.6k
              PCRE2_EXTRA_TURKISH_CASING &&
6427
0
            UCD_ANY_I(c)))
6428
11.6k
#endif
6429
11.6k
        {
6430
11.6k
        uint32_t d;
6431
6432
11.6k
#ifdef SUPPORT_UNICODE
6433
11.6k
        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
6434
10.4k
#endif
6435
10.4k
          {
6436
10.4k
#if PCRE2_CODE_UNIT_WIDTH != 8
6437
10.4k
          if (c > 255) d = c; else
6438
7.93k
#endif
6439
7.93k
          d = TABLE_GET(c, cb->fcc, c);
6440
10.4k
          }
6441
6442
11.6k
        if (c != d && pptr[2] == d)
6443
663
          {
6444
663
          pptr += 3;                 /* Move on to class end */
6445
663
          meta = c;
6446
663
          if ((options & PCRE2_CASELESS) == 0)
6447
457
            {
6448
457
            reset_caseful = TRUE;
6449
457
            options |= PCRE2_CASELESS;
6450
457
            req_caseopt = REQ_CASELESS;
6451
457
            }
6452
663
          goto CLASS_CASELESS_CHAR;
6453
663
          }
6454
11.6k
        }
6455
13.0k
      }
6456
6457
    /* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */
6458
6459
81.0k
    pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,
6460
81.0k
                                          &code, meta == META_CLASS_NOT, NULL,
6461
81.0k
                                          errorcodeptr, cb, lengthptr);
6462
81.0k
    if (pptr == NULL) return 0;
6463
81.0k
    PCRE2_ASSERT(*pptr == META_CLASS_END);
6464
6465
90.2k
    CLASS_END_PROCESSING:
6466
6467
    /* If this class is the first thing in the branch, there can be no first
6468
    char setting, whatever the repeat count. Any reqcu setting must remain
6469
    unchanged after any kind of repeat. */
6470
6471
90.2k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6472
90.2k
    zerofirstcu = firstcu;
6473
90.2k
    zerofirstcuflags = firstcuflags;
6474
90.2k
    zeroreqcu = reqcu;
6475
90.2k
    zeroreqcuflags = reqcuflags;
6476
90.2k
    break;  /* End of class processing */
6477
6478
6479
    /* ===================================================================*/
6480
    /* Deal with (*VERB)s. */
6481
6482
    /* Check for open captures before ACCEPT and close those that are within
6483
    the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6484
    assertion. In the first pass, just accumulate the length required;
6485
    otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6486
    workspace overflow. Do not set firstcu after *ACCEPT. */
6487
6488
12.1k
    case META_ACCEPT:
6489
12.1k
    cb->had_accept = had_accept = TRUE;
6490
12.1k
    for (oc = open_caps;
6491
16.9k
         oc != NULL && oc->assert_depth >= cb->assert_depth;
6492
12.1k
         oc = oc->next)
6493
4.81k
      {
6494
4.81k
      if (lengthptr != NULL)
6495
2.40k
        {
6496
2.40k
        *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6497
2.40k
        }
6498
2.40k
      else
6499
2.40k
        {
6500
2.40k
        *code++ = OP_CLOSE;
6501
2.40k
        PUT2INC(code, 0, oc->number);
6502
2.40k
        }
6503
4.81k
      }
6504
12.1k
    *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6505
12.1k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6506
12.1k
    break;
6507
6508
1.68k
    case META_PRUNE:
6509
4.34k
    case META_SKIP:
6510
4.34k
    cb->had_pruneorskip = TRUE;
6511
4.34k
    PCRE2_FALLTHROUGH /* Fall through */
6512
6.09k
    case META_COMMIT:
6513
8.07k
    case META_FAIL:
6514
8.07k
    *code++ = verbops[(meta - META_MARK) >> 16];
6515
8.07k
    break;
6516
6517
5.75k
    case META_THEN:
6518
5.75k
    cb->external_flags |= PCRE2_HASTHEN;
6519
5.75k
    *code++ = OP_THEN;
6520
5.75k
    break;
6521
6522
    /* Handle verbs with arguments. Arguments can be very long, especially in
6523
    16- and 32-bit modes, and can overflow the workspace in the first pass.
6524
    However, the argument length is constrained to be small enough to fit in
6525
    one code unit. This check happens in parse_regex(). In the first pass,
6526
    instead of putting the argument into memory, we just update the length
6527
    counter and set up an empty argument. */
6528
6529
1.01k
    case META_THEN_ARG:
6530
1.01k
    cb->external_flags |= PCRE2_HASTHEN;
6531
1.01k
    goto VERB_ARG;
6532
6533
1.08k
    case META_PRUNE_ARG:
6534
4.34k
    case META_SKIP_ARG:
6535
4.34k
    cb->had_pruneorskip = TRUE;
6536
4.34k
    PCRE2_FALLTHROUGH /* Fall through */
6537
7.96k
    case META_MARK:
6538
8.69k
    case META_COMMIT_ARG:
6539
9.71k
    VERB_ARG:
6540
9.71k
    *code++ = verbops[(meta - META_MARK) >> 16];
6541
    /* The length is in characters. */
6542
9.71k
    verbarglen = *(++pptr);
6543
9.71k
    verbculen = 0;
6544
9.71k
    tempcode = code++;
6545
181k
    for (int i = 0; i < (int)verbarglen; i++)
6546
171k
      {
6547
171k
      meta = *(++pptr);
6548
171k
#ifdef SUPPORT_UNICODE
6549
171k
      if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6550
153k
#endif
6551
153k
        {
6552
153k
        mclength = 1;
6553
153k
        mcbuffer[0] = meta;
6554
153k
        }
6555
171k
      if (lengthptr != NULL) *lengthptr += mclength; else
6556
79.1k
        {
6557
79.1k
        memcpy(code, mcbuffer, CU2BYTES(mclength));
6558
79.1k
        code += mclength;
6559
79.1k
        verbculen += mclength;
6560
79.1k
        }
6561
171k
      }
6562
6563
9.71k
    *tempcode = verbculen;   /* Fill in the code unit length */
6564
9.71k
    *code++ = 0;             /* Terminating zero */
6565
9.71k
    break;
6566
6567
6568
    /* ===================================================================*/
6569
    /* Handle options change. The new setting must be passed back for use in
6570
    subsequent branches. Reset the greedy defaults and the case value for
6571
    firstcu and reqcu. */
6572
6573
1.74k
    case META_OPTIONS:
6574
1.74k
    *optionsptr = options = *(++pptr);
6575
1.74k
    *xoptionsptr = xoptions = *(++pptr);
6576
1.74k
    greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6577
1.74k
    greedy_non_default = greedy_default ^ 1;
6578
1.74k
    req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6579
1.74k
    break;
6580
6581
    /* ===================================================================*/
6582
    /* Handle scan substring. Scan substring assertion starts with META_SCS,
6583
    which recursively calls compile_branch. The first opcode processed by
6584
    this recursive call is always META_OFFSET. */
6585
6586
1.13k
    case META_OFFSET:
6587
1.13k
    if (lengthptr != NULL)
6588
572
      {
6589
572
      pptr = PRIV(compile_parse_scan_substr_args)(pptr, errorcodeptr, cb, lengthptr);
6590
572
      if (pptr == NULL)
6591
8
        return 0;
6592
564
      break;
6593
572
      }
6594
6595
1.68k
    while (TRUE)
6596
1.68k
      {
6597
1.68k
      int count, index;
6598
1.68k
      named_group *ng;
6599
6600
1.68k
      switch (META_CODE(*pptr))
6601
1.68k
        {
6602
561
        case META_OFFSET:
6603
561
        pptr++;
6604
561
        SKIPOFFSET(pptr);
6605
561
        continue;
6606
6607
213
        case META_CAPTURE_NAME:
6608
213
        ng = cb->named_groups + pptr[1];
6609
213
        pptr += 2;
6610
213
        count = 0;
6611
213
        index = 0;
6612
6613
213
        if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6614
213
          &count, errorcodeptr, cb)) return 0;
6615
6616
213
        code[0] = OP_DNCREF;
6617
213
        PUT2(code, 1, index);
6618
213
        PUT2(code, 1 + IMM2_SIZE, count);
6619
213
        code += 1 + 2 * IMM2_SIZE;
6620
213
        continue;
6621
6622
348
        case META_CAPTURE_NUMBER:
6623
348
        pptr += 2;
6624
348
        if (pptr[-1] == 0) continue;
6625
6626
348
        code[0] = OP_CREF;
6627
348
        PUT2(code, 1, pptr[-1]);
6628
348
        code += 1 + IMM2_SIZE;
6629
348
        continue;
6630
6631
561
        default:
6632
561
        break;
6633
1.68k
        }
6634
6635
561
      break;
6636
1.68k
      }
6637
561
    --pptr;
6638
561
    break;
6639
6640
1.13k
    case META_SCS:
6641
1.13k
    bravalue = OP_ASSERT_SCS;
6642
1.13k
    cb->assert_depth += 1;
6643
1.13k
    goto GROUP_PROCESS;
6644
6645
6646
    /* ===================================================================*/
6647
    /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6648
    because it could be a numerical check on recursion, or a name check on a
6649
    group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6650
    we can handle it either way. We first try for a name; if not found, process
6651
    the number. */
6652
6653
3.61k
    case META_COND_RNUMBER:   /* (?(Rdigits) */
6654
6.14k
    case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6655
6.16k
    case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6656
6.16k
    bravalue = OP_COND;
6657
6658
6.16k
    if (lengthptr != NULL)
6659
3.23k
      {
6660
3.23k
      uint32_t i;
6661
3.23k
      PCRE2_SPTR name;
6662
3.23k
      named_group *ng;
6663
3.23k
      uint32_t *start_pptr = pptr;
6664
3.23k
      uint32_t length = *(++pptr);
6665
6666
3.23k
      GETPLUSOFFSET(offset, pptr);
6667
3.23k
      name = cb->start_pattern + offset;
6668
6669
      /* In the first pass, the names generated in the pre-pass are available,
6670
      but the main name table has not yet been created. Scan the list of names
6671
      generated in the pre-pass in order to get a number and whether or not
6672
      this name is duplicated. If it is not duplicated, we can handle it as a
6673
      numerical group. */
6674
6675
3.23k
      ng = PRIV(compile_find_named_group)(name, length, cb);
6676
6677
3.23k
      if (ng == NULL)
6678
1.95k
        {
6679
        /* If the name was not found we have a bad reference, unless we are
6680
        dealing with R<digits>, which is treated as a recursion test by
6681
        number. */
6682
6683
1.95k
        groupnumber = 0;
6684
1.95k
        if (meta == META_COND_RNUMBER)
6685
1.90k
          {
6686
3.53k
          for (i = 1; i < length; i++)
6687
1.63k
            {
6688
1.63k
            groupnumber = groupnumber * 10 + (name[i] - CHAR_0);
6689
1.63k
            if (groupnumber > MAX_GROUP_NUMBER)
6690
3
              {
6691
3
              *errorcodeptr = ERR61;
6692
3
              cb->erroroffset = offset + i;
6693
3
              return 0;
6694
3
              }
6695
1.63k
            }
6696
1.90k
          }
6697
6698
1.94k
        if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6699
68
          {
6700
68
          *errorcodeptr = ERR15;
6701
68
          cb->erroroffset = offset;
6702
68
          return 0;
6703
68
          }
6704
6705
        /* (?(Rdigits) is treated as a recursion test by number. A value of
6706
        zero (which is the result of both (?(R) and (?(R0)) means "any", and is
6707
        translated into RREF_ANY (which is 0xffff). */
6708
6709
1.87k
        if (groupnumber == 0) groupnumber = RREF_ANY;
6710
1.87k
        PCRE2_ASSERT(start_pptr[0] == META_COND_RNUMBER);
6711
1.87k
        start_pptr[1] = groupnumber;
6712
1.87k
        skipunits = 1+IMM2_SIZE;
6713
1.87k
        goto GROUP_PROCESS_NOTE_EMPTY;
6714
1.94k
        }
6715
6716
      /* From here on, we know we have a name (not a number),
6717
      so treat META_COND_RNUMBER the same as META_COND_NAME. */
6718
1.28k
      if (meta == META_COND_RNUMBER) meta = META_COND_NAME;
6719
6720
1.28k
      if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
6721
622
        {
6722
        /* Found a non-duplicated name. Since it is a global,
6723
        it is enough to update it in the pre-processing phase. */
6724
622
        if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6725
6726
622
        start_pptr[0] = meta;
6727
622
        start_pptr[1] = ng->number;
6728
6729
622
        skipunits = 1 + IMM2_SIZE;
6730
622
        goto GROUP_PROCESS_NOTE_EMPTY;
6731
622
        }
6732
6733
      /* We have a duplicated name. In the compile pass we have to search the
6734
      main table in order to get the index and count values. */
6735
6736
667
      start_pptr[0] = meta | 1;
6737
667
      start_pptr[1] = (uint32_t)(ng - cb->named_groups);
6738
6739
      /* A duplicated name was found. Note that if an R<digits> name is found
6740
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6741
667
      skipunits = 1 + 2 * IMM2_SIZE;
6742
667
      }
6743
2.92k
    else
6744
2.92k
      {
6745
      /* Otherwise lengthptr equals NULL,
6746
      which is the second phase of compilation. */
6747
2.92k
      int count, index;
6748
2.92k
      named_group *ng;
6749
6750
      /* Generate code using the data
6751
      collected in the pre-processing phase. */
6752
6753
2.92k
      if (meta == META_COND_RNUMBER)
6754
1.64k
        {
6755
1.64k
        code[1+LINK_SIZE] = OP_RREF;
6756
1.64k
        PUT2(code, 2 + LINK_SIZE, pptr[1]);
6757
1.64k
        skipunits = 1 + IMM2_SIZE;
6758
1.64k
        pptr += 1 + SIZEOFFSET;
6759
1.64k
        goto GROUP_PROCESS_NOTE_EMPTY;
6760
1.64k
        }
6761
6762
1.28k
      if (meta_arg == 0)
6763
616
        {
6764
616
        code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6765
616
        PUT2(code, 2 + LINK_SIZE, pptr[1]);
6766
616
        skipunits = 1 + IMM2_SIZE;
6767
616
        pptr += 1 + SIZEOFFSET;
6768
616
        goto GROUP_PROCESS_NOTE_EMPTY;
6769
616
        }
6770
6771
667
      ng = cb->named_groups + pptr[1];
6772
667
      count = 0;  /* Values for first pass (avoids compiler warning) */
6773
667
      index = 0;
6774
6775
      /* The failed case is an internal error. */
6776
667
      if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6777
667
            &count, errorcodeptr, cb)) return 0;
6778
6779
      /* A duplicated name was found. Note that if an R<digits> name is found
6780
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6781
6782
667
      code[1 + LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;
6783
6784
      /* Insert appropriate data values. */
6785
667
      PUT2(code, 2 + LINK_SIZE, index);
6786
667
      PUT2(code, 2 + LINK_SIZE + IMM2_SIZE, count);
6787
667
      skipunits = 1 + 2 * IMM2_SIZE;
6788
667
      pptr += 1 + SIZEOFFSET;
6789
667
      }
6790
6791
1.33k
    PCRE2_ASSERT(meta != META_CAPTURE_NAME);
6792
1.33k
    goto GROUP_PROCESS_NOTE_EMPTY;
6793
6794
    /* The DEFINE condition is always false. Its internal groups may never
6795
    be called, so matched_char must remain false, hence the jump to
6796
    GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6797
6798
0
    case META_COND_DEFINE:
6799
0
    bravalue = OP_COND;
6800
0
    GETPLUSOFFSET(offset, pptr);
6801
0
    code[1+LINK_SIZE] = OP_DEFINE;
6802
0
    skipunits = 1;
6803
0
    goto GROUP_PROCESS;
6804
6805
    /* Conditional test of a group's being set. */
6806
6807
1.05k
    case META_COND_NUMBER:
6808
1.05k
    bravalue = OP_COND;
6809
1.05k
    GETPLUSOFFSET(offset, pptr);
6810
6811
1.05k
    groupnumber = *(++pptr);
6812
1.05k
    if (groupnumber > cb->bracount)
6813
35
      {
6814
35
      *errorcodeptr = ERR15;
6815
35
      cb->erroroffset = offset;
6816
35
      return 0;
6817
35
      }
6818
1.01k
    if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6819
6820
    /* Point at initial ( for too many branches error */
6821
1.01k
    offset -= 2;
6822
1.01k
    code[1+LINK_SIZE] = OP_CREF;
6823
1.01k
    skipunits = 1+IMM2_SIZE;
6824
1.01k
    PUT2(code, 2+LINK_SIZE, groupnumber);
6825
1.01k
    goto GROUP_PROCESS_NOTE_EMPTY;
6826
6827
    /* Test for the PCRE2 version. */
6828
6829
0
    case META_COND_VERSION:
6830
0
    bravalue = OP_COND;
6831
0
    if (pptr[1] > 0)
6832
0
      code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6833
0
        (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6834
0
          OP_TRUE : OP_FALSE;
6835
0
    else
6836
0
      code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6837
0
        OP_TRUE : OP_FALSE;
6838
0
    skipunits = 1;
6839
0
    pptr += 3;
6840
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6841
6842
    /* The condition is an assertion, possibly preceded by a callout. */
6843
6844
12.1k
    case META_COND_ASSERT:
6845
12.1k
    bravalue = OP_COND;
6846
12.1k
    goto GROUP_PROCESS_NOTE_EMPTY;
6847
6848
6849
    /* ===================================================================*/
6850
    /* Handle all kinds of nested bracketed groups. The non-capturing,
6851
    non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6852
6853
17.2k
    case META_LOOKAHEAD:
6854
17.2k
    bravalue = OP_ASSERT;
6855
17.2k
    cb->assert_depth += 1;
6856
17.2k
    goto GROUP_PROCESS;
6857
6858
22.5k
    case META_LOOKAHEAD_NA:
6859
22.5k
    bravalue = OP_ASSERT_NA;
6860
22.5k
    cb->assert_depth += 1;
6861
22.5k
    goto GROUP_PROCESS;
6862
6863
    /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6864
    thing to do, but Perl allows all assertions to be quantified, and when
6865
    they contain capturing parentheses there may be a potential use for
6866
    this feature. Not that that applies to a quantified (?!) but we allow
6867
    it for uniformity. */
6868
6869
12.8k
    case META_LOOKAHEADNOT:
6870
12.8k
    if (pptr[1] == META_KET &&
6871
4.29k
         (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6872
3.15k
      {
6873
3.15k
      *code++ = OP_FAIL;
6874
3.15k
      pptr++;
6875
3.15k
      }
6876
9.67k
    else
6877
9.67k
      {
6878
9.67k
      bravalue = OP_ASSERT_NOT;
6879
9.67k
      cb->assert_depth += 1;
6880
9.67k
      goto GROUP_PROCESS;
6881
9.67k
      }
6882
3.15k
    break;
6883
6884
6.97k
    case META_LOOKBEHIND:
6885
6.97k
    bravalue = OP_ASSERTBACK;
6886
6.97k
    cb->assert_depth += 1;
6887
6.97k
    goto GROUP_PROCESS;
6888
6889
6.73k
    case META_LOOKBEHINDNOT:
6890
6.73k
    bravalue = OP_ASSERTBACK_NOT;
6891
6.73k
    cb->assert_depth += 1;
6892
6.73k
    goto GROUP_PROCESS;
6893
6894
3.10k
    case META_LOOKBEHIND_NA:
6895
3.10k
    bravalue = OP_ASSERTBACK_NA;
6896
3.10k
    cb->assert_depth += 1;
6897
3.10k
    goto GROUP_PROCESS;
6898
6899
8.98k
    case META_ATOMIC:
6900
8.98k
    bravalue = OP_ONCE;
6901
8.98k
    goto GROUP_PROCESS_NOTE_EMPTY;
6902
6903
4.50k
    case META_SCRIPT_RUN:
6904
4.50k
    bravalue = OP_SCRIPT_RUN;
6905
4.50k
    goto GROUP_PROCESS_NOTE_EMPTY;
6906
6907
20.9k
    case META_NOCAPTURE:
6908
20.9k
    bravalue = OP_BRA;
6909
    /* Fall through */
6910
6911
    /* Process nested bracketed regex. The nesting depth is maintained for the
6912
    benefit of the stackguard function. The test for too deep nesting is now
6913
    done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6914
    others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6915
    note of whether or not they may match an empty string. */
6916
6917
260k
    GROUP_PROCESS_NOTE_EMPTY:
6918
260k
    note_group_empty = TRUE;
6919
6920
327k
    GROUP_PROCESS:
6921
327k
    cb->parens_depth += 1;
6922
327k
    *code = bravalue;
6923
327k
    pptr++;
6924
327k
    tempcode = code;
6925
327k
    tempreqvary = cb->req_varyopt;        /* Save value before group */
6926
327k
    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6927
6928
327k
    if ((group_return =
6929
327k
         compile_regex(
6930
327k
         options,                         /* The options state */
6931
327k
         xoptions,                        /* The extra options state */
6932
327k
         &tempcode,                       /* Where to put code (updated) */
6933
327k
         &pptr,                           /* Input pointer (updated) */
6934
327k
         errorcodeptr,                    /* Where to put an error message */
6935
327k
         skipunits,                       /* Skip over bracket number */
6936
327k
         &subfirstcu,                     /* For possible first char */
6937
327k
         &subfirstcuflags,
6938
327k
         &subreqcu,                       /* For possible last char */
6939
327k
         &subreqcuflags,
6940
327k
         bcptr,                           /* Current branch chain */
6941
327k
         open_caps,                       /* Pointer to capture stack */
6942
327k
         cb,                              /* Compile data block */
6943
327k
         (lengthptr == NULL)? NULL :      /* Actual compile phase */
6944
327k
           &length_prevgroup              /* Pre-compile phase */
6945
327k
         )) == 0)
6946
610
      return 0;  /* Error */
6947
6948
327k
    cb->parens_depth -= 1;
6949
6950
    /* If that was a non-conditional significant group (not an assertion, not a
6951
    DEFINE) that matches at least one character, then the current item matches
6952
    a character. Conditionals are handled below. */
6953
6954
327k
    if (note_group_empty && bravalue != OP_COND && group_return > 0)
6955
83.2k
      matched_char = TRUE;
6956
6957
    /* If we've just compiled an assertion, pop the assert depth. */
6958
6959
327k
    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)
6960
67.3k
      cb->assert_depth -= 1;
6961
6962
    /* At the end of compiling, code is still pointing to the start of the
6963
    group, while tempcode has been updated to point past the end of the group.
6964
    The parsed pattern pointer (pptr) is on the closing META_KET.
6965
6966
    If this is a conditional bracket, check that there are no more than
6967
    two branches in the group, or just one if it's a DEFINE group. We do this
6968
    in the real compile phase, not in the pre-pass, where the whole group may
6969
    not be available. */
6970
6971
327k
    if (bravalue == OP_COND && lengthptr == NULL)
6972
9.45k
      {
6973
9.45k
      PCRE2_UCHAR *tc = code;
6974
9.45k
      int condcount = 0;
6975
6976
11.2k
      do {
6977
11.2k
         condcount++;
6978
11.2k
         tc += GET(tc,1);
6979
11.2k
         }
6980
11.2k
      while (*tc != OP_KET);
6981
6982
      /* A DEFINE group is never obeyed inline (the "condition" is always
6983
      false). It must have only one branch. Having checked this, change the
6984
      opcode to OP_FALSE. */
6985
6986
9.45k
      if (code[LINK_SIZE+1] == OP_DEFINE)
6987
0
        {
6988
0
        if (condcount > 1)
6989
0
          {
6990
0
          cb->erroroffset = offset;
6991
0
          *errorcodeptr = ERR54;
6992
0
          return 0;
6993
0
          }
6994
0
        code[LINK_SIZE+1] = OP_FALSE;
6995
0
        bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6996
0
        }
6997
6998
      /* A "normal" conditional group. If there is just one branch, we must not
6999
      make use of its firstcu or reqcu, because this is equivalent to an
7000
      empty second branch. Also, it may match an empty string. If there are two
7001
      branches, this item must match a character if the group must. */
7002
7003
9.45k
      else
7004
9.45k
        {
7005
9.45k
        if (condcount > 2)
7006
23
          {
7007
23
          cb->erroroffset = offset;
7008
23
          *errorcodeptr = ERR27;
7009
23
          return 0;
7010
23
          }
7011
9.42k
        if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
7012
1.53k
          else if (group_return > 0) matched_char = TRUE;
7013
9.42k
        }
7014
9.45k
      }
7015
7016
    /* In the pre-compile phase, update the length by the length of the group,
7017
    less the brackets at either end. Then reduce the compiled code to just a
7018
    set of non-capturing brackets so that it doesn't use much memory if it is
7019
    duplicated by a quantifier.*/
7020
7021
327k
    if (lengthptr != NULL)
7022
165k
      {
7023
165k
      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7024
0
        {
7025
0
        *errorcodeptr = ERR20;
7026
0
        return 0;
7027
0
        }
7028
165k
      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7029
165k
      code++;   /* This already contains bravalue */
7030
165k
      PUTINC(code, 0, 1 + LINK_SIZE);
7031
165k
      *code++ = OP_KET;
7032
165k
      PUTINC(code, 0, 1 + LINK_SIZE);
7033
165k
      break;    /* No need to waste time with special character handling */
7034
165k
      }
7035
7036
    /* Otherwise update the main code pointer to the end of the group. */
7037
7038
161k
    code = tempcode;
7039
7040
    /* For a DEFINE group, required and first character settings are not
7041
    relevant. */
7042
7043
161k
    if (bravalue == OP_DEFINE) break;
7044
7045
    /* Handle updating of the required and first code units for other types of
7046
    group. Update for normal brackets of all kinds, and conditions with two
7047
    branches (see code above). If the bracket is followed by a quantifier with
7048
    zero repeat, we have to back off. Hence the definition of zeroreqcu and
7049
    zerofirstcu outside the main loop so that they can be accessed for the back
7050
    off. */
7051
7052
161k
    zeroreqcu = reqcu;
7053
161k
    zeroreqcuflags = reqcuflags;
7054
161k
    zerofirstcu = firstcu;
7055
161k
    zerofirstcuflags = firstcuflags;
7056
161k
    groupsetfirstcu = FALSE;
7057
7058
161k
    if (bravalue >= OP_ONCE)  /* Not an assertion */
7059
128k
      {
7060
      /* If we have not yet set a firstcu in this branch, take it from the
7061
      subpattern, remembering that it was set here so that a repeat of more
7062
      than one can replicate it as reqcu if necessary. If the subpattern has
7063
      no firstcu, set "none" for the whole branch. In both cases, a zero
7064
      repeat forces firstcu to "none". */
7065
7066
128k
      if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
7067
20.9k
        {
7068
20.9k
        if (subfirstcuflags < REQ_NONE)
7069
6.99k
          {
7070
6.99k
          firstcu = subfirstcu;
7071
6.99k
          firstcuflags = subfirstcuflags;
7072
6.99k
          groupsetfirstcu = TRUE;
7073
6.99k
          }
7074
13.9k
        else firstcuflags = REQ_NONE;
7075
20.9k
        zerofirstcuflags = REQ_NONE;
7076
20.9k
        }
7077
7078
      /* If firstcu was previously set, convert the subpattern's firstcu
7079
      into reqcu if there wasn't one, using the vary flag that was in
7080
      existence beforehand. */
7081
7082
107k
      else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
7083
3.18k
        {
7084
3.18k
        subreqcu = subfirstcu;
7085
3.18k
        subreqcuflags = subfirstcuflags | tempreqvary;
7086
3.18k
        }
7087
7088
      /* If the subpattern set a required code unit (or set a first code unit
7089
      that isn't really the first code unit - see above), set it. */
7090
7091
128k
      if (subreqcuflags < REQ_NONE)
7092
29.2k
        {
7093
29.2k
        reqcu = subreqcu;
7094
29.2k
        reqcuflags = subreqcuflags;
7095
29.2k
        }
7096
128k
      }
7097
7098
    /* For a forward assertion, we take the reqcu, if set, provided that the
7099
    group has also set a firstcu. This can be helpful if the pattern that
7100
    follows the assertion doesn't set a different char. For example, it's
7101
    useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7102
    because it leads to incorrect effects for patterns such as /(?=a)a.+/ when
7103
    the "real" "a" would then become a reqcu instead of a firstcu. This is
7104
    overcome by a scan at the end if there's no firstcu, looking for an
7105
    asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7106
    we must only take the reqcu when the group also set a firstcu. Otherwise,
7107
    in that example, 'X' ends up set for both. */
7108
7109
33.2k
    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7110
19.7k
             subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7111
11.1k
      {
7112
11.1k
      reqcu = subreqcu;
7113
11.1k
      reqcuflags = subreqcuflags;
7114
11.1k
      }
7115
7116
161k
    break;  /* End of nested group handling */
7117
7118
7119
    /* ===================================================================*/
7120
    /* Handle named backreferences and recursions. */
7121
7122
4.50k
    case META_BACKREF_BYNAME:
7123
4.94k
    case META_RECURSE_BYNAME:
7124
4.94k
      {
7125
4.94k
      int count, index;
7126
4.94k
      PCRE2_SPTR name;
7127
4.94k
      named_group *ng;
7128
4.94k
      uint32_t length = *(++pptr);
7129
7130
4.94k
      GETPLUSOFFSET(offset, pptr);
7131
4.94k
      name = cb->start_pattern + offset;
7132
7133
      /* In the first pass, the names generated in the pre-pass are available,
7134
      but the main name table has not yet been created. Scan the list of names
7135
      generated in the pre-pass in order to get a number and whether or not
7136
      this name is duplicated. */
7137
7138
4.94k
      ng = PRIV(compile_find_named_group)(name, length, cb);
7139
7140
4.94k
      if (ng == NULL)
7141
76
        {
7142
        /* If the name was not found we have a bad reference. */
7143
76
        *errorcodeptr = ERR15;
7144
76
        cb->erroroffset = offset;
7145
76
        return 0;
7146
76
        }
7147
7148
4.86k
      groupnumber = ng->number;
7149
7150
      /* For a recursion, that's all that is needed. We can now go to
7151
      the code that handles numerical recursion, applying it to the first
7152
      group with the given name. */
7153
7154
4.86k
      if (meta == META_RECURSE_BYNAME)
7155
406
        {
7156
406
        meta_arg = groupnumber;
7157
406
        goto HANDLE_NUMERICAL_RECURSION;
7158
406
        }
7159
7160
      /* For a back reference, update the back reference map and the
7161
      maximum back reference. */
7162
7163
4.46k
      cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7164
4.46k
      if (groupnumber > cb->top_backref)
7165
474
        cb->top_backref = groupnumber;
7166
7167
      /* If a back reference name is not duplicated, we can handle it as
7168
      a numerical reference. */
7169
7170
4.46k
      if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
7171
665
        {
7172
665
        meta_arg = groupnumber;
7173
665
        goto HANDLE_SINGLE_REFERENCE;
7174
665
        }
7175
7176
      /* If a back reference name is duplicated, we generate a different
7177
      opcode to a numerical back reference. In the second pass we must
7178
      search for the index and count in the final name table. */
7179
7180
3.79k
      count = 0;  /* Values for first pass (avoids compiler warning) */
7181
3.79k
      index = 0;
7182
3.79k
      if (lengthptr == NULL && !PRIV(compile_find_dupname_details)(name, length,
7183
1.88k
            &index, &count, errorcodeptr, cb)) return 0;
7184
7185
3.79k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7186
3.79k
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7187
3.79k
      PUT2INC(code, 0, index);
7188
3.79k
      PUT2INC(code, 0, count);
7189
3.79k
      if ((options & PCRE2_CASELESS) != 0)
7190
1.98k
        *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7191
1.98k
                   REFI_FLAG_CASELESS_RESTRICT : 0) |
7192
1.98k
                  (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7193
1.98k
                   REFI_FLAG_TURKISH_CASING : 0);
7194
3.79k
      }
7195
0
    break;
7196
7197
7198
    /* ===================================================================*/
7199
    /* Handle a numerical callout. */
7200
7201
1.00M
    case META_CALLOUT_NUMBER:
7202
1.00M
    code[0] = OP_CALLOUT;
7203
1.00M
    PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7204
1.00M
    PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7205
1.00M
    code[1 + 2*LINK_SIZE] = pptr[3];
7206
1.00M
    pptr += 3;
7207
1.00M
    code += PRIV(OP_lengths)[OP_CALLOUT];
7208
1.00M
    break;
7209
7210
7211
    /* ===================================================================*/
7212
    /* Handle a callout with a string argument. In the pre-pass we just compute
7213
    the length without generating anything. The length in pptr[3] includes both
7214
    delimiters; in the actual compile only the first one is copied, but a
7215
    terminating zero is added. Any doubled delimiters within the string make
7216
    this an overestimate, but it is not worth bothering about. */
7217
7218
2.28k
    case META_CALLOUT_STRING:
7219
2.28k
    if (lengthptr != NULL)
7220
1.16k
      {
7221
1.16k
      *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7222
1.16k
      pptr += 3;
7223
1.16k
      SKIPOFFSET(pptr);
7224
1.16k
      }
7225
7226
    /* In the real compile we can copy the string. The starting delimiter is
7227
     included so that the client can discover it if they want. We also pass the
7228
     start offset to help a script language give better error messages. */
7229
7230
1.12k
    else
7231
1.12k
      {
7232
1.12k
      PCRE2_SPTR pp;
7233
1.12k
      uint32_t delimiter;
7234
1.12k
      uint32_t length = pptr[3];
7235
1.12k
      PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7236
7237
1.12k
      code[0] = OP_CALLOUT_STR;
7238
1.12k
      PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7239
1.12k
      PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7240
7241
1.12k
      pptr += 3;
7242
1.12k
      GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7243
1.12k
      pp = cb->start_pattern + offset;
7244
1.12k
      delimiter = *callout_string++ = *pp++;
7245
1.12k
      if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7246
70
        delimiter = CHAR_RIGHT_CURLY_BRACKET;
7247
1.12k
      PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7248
7249
      /* The syntax of the pattern was checked in the parsing scan. The length
7250
      includes both delimiters, but we have passed the opening one just above,
7251
      so we reduce length before testing it. The test is for > 1 because we do
7252
      not want to copy the final delimiter. This also ensures that pp[1] is
7253
      accessible. */
7254
7255
8.87k
      while (--length > 1)
7256
7.75k
        {
7257
7.75k
        if (*pp == delimiter && pp[1] == delimiter)
7258
196
          {
7259
196
          *callout_string++ = delimiter;
7260
196
          pp += 2;
7261
196
          length--;
7262
196
          }
7263
7.56k
        else *callout_string++ = *pp++;
7264
7.75k
        }
7265
1.12k
      *callout_string++ = CHAR_NUL;
7266
7267
      /* Set the length of the entire item, then advance to its end. */
7268
7269
1.12k
      PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7270
1.12k
      code = callout_string;
7271
1.12k
      }
7272
2.28k
    break;
7273
7274
7275
    /* ===================================================================*/
7276
    /* Handle repetition. The different types are all sorted out in the parsing
7277
    pass. */
7278
7279
9.01k
    case META_MINMAX_PLUS:
7280
11.0k
    case META_MINMAX_QUERY:
7281
90.0k
    case META_MINMAX:
7282
90.0k
    repeat_min = *(++pptr);
7283
90.0k
    repeat_max = *(++pptr);
7284
90.0k
    goto REPEAT;
7285
7286
123k
    case META_ASTERISK:
7287
131k
    case META_ASTERISK_PLUS:
7288
134k
    case META_ASTERISK_QUERY:
7289
134k
    repeat_min = 0;
7290
134k
    repeat_max = REPEAT_UNLIMITED;
7291
134k
    goto REPEAT;
7292
7293
159k
    case META_PLUS:
7294
280k
    case META_PLUS_PLUS:
7295
281k
    case META_PLUS_QUERY:
7296
281k
    repeat_min = 1;
7297
281k
    repeat_max = REPEAT_UNLIMITED;
7298
281k
    goto REPEAT;
7299
7300
100k
    case META_QUERY:
7301
103k
    case META_QUERY_PLUS:
7302
108k
    case META_QUERY_QUERY:
7303
108k
    repeat_min = 0;
7304
108k
    repeat_max = 1;
7305
7306
614k
    REPEAT:
7307
614k
    if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7308
7309
    /* Remember whether this is a variable length repeat, and default to
7310
    single-char opcodes. */
7311
7312
614k
    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7313
7314
    /* Adjust first and required code units for a zero repeat. */
7315
7316
614k
    if (repeat_min == 0)
7317
258k
      {
7318
258k
      firstcu = zerofirstcu;
7319
258k
      firstcuflags = zerofirstcuflags;
7320
258k
      reqcu = zeroreqcu;
7321
258k
      reqcuflags = zeroreqcuflags;
7322
258k
      }
7323
7324
    /* Note the greediness and possessiveness. */
7325
7326
614k
    switch (meta)
7327
614k
      {
7328
9.01k
      case META_MINMAX_PLUS:
7329
16.7k
      case META_ASTERISK_PLUS:
7330
137k
      case META_PLUS_PLUS:
7331
140k
      case META_QUERY_PLUS:
7332
140k
      repeat_type = 0;                  /* Force greedy */
7333
140k
      possessive_quantifier = TRUE;
7334
140k
      break;
7335
7336
2.01k
      case META_MINMAX_QUERY:
7337
4.80k
      case META_ASTERISK_QUERY:
7338
6.59k
      case META_PLUS_QUERY:
7339
11.6k
      case META_QUERY_QUERY:
7340
11.6k
      repeat_type = greedy_non_default;
7341
11.6k
      possessive_quantifier = FALSE;
7342
11.6k
      break;
7343
7344
462k
      default:
7345
462k
      repeat_type = greedy_default;
7346
462k
      possessive_quantifier = FALSE;
7347
462k
      break;
7348
614k
      }
7349
7350
    /* Save start of previous item, in case we have to move it up in order to
7351
    insert something before it, and remember what it was. */
7352
7353
614k
    PCRE2_ASSERT(previous != NULL);
7354
614k
    tempcode = previous;
7355
614k
    op_previous = *previous;
7356
7357
    /* Now handle repetition for the different types of item. If the repeat
7358
    minimum and the repeat maximum are both 1, we can ignore the quantifier for
7359
    non-parenthesized items, as they have only one alternative. For anything in
7360
    parentheses, we must not ignore if {1} is possessive. */
7361
7362
614k
    switch (op_previous)
7363
614k
      {
7364
      /* If previous was a character or negated character match, abolish the
7365
      item and generate a repeat item instead. If a char item has a minimum of
7366
      more than one, ensure that it is set in reqcu - it might not be if a
7367
      sequence such as x{3} is the first thing in a branch because the x will
7368
      have gone into firstcu instead.  */
7369
7370
155k
      case OP_CHAR:
7371
220k
      case OP_CHARI:
7372
243k
      case OP_NOT:
7373
258k
      case OP_NOTI:
7374
258k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7375
258k
      op_type = chartypeoffset[op_previous - OP_CHAR];
7376
7377
      /* Deal with UTF characters that take up more than one code unit. */
7378
7379
258k
#ifdef MAYBE_UTF_MULTI
7380
258k
      if (utf && NOT_FIRSTCU(code[-1]))
7381
3.10k
        {
7382
3.10k
        PCRE2_UCHAR *lastchar = code - 1;
7383
3.10k
        BACKCHAR(lastchar);
7384
3.10k
        mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7385
3.10k
        memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7386
3.10k
        }
7387
255k
      else
7388
255k
#endif  /* MAYBE_UTF_MULTI */
7389
7390
      /* Handle the case of a single code unit - either with no UTF support, or
7391
      with UTF disabled, or for a single-code-unit UTF character. In the latter
7392
      case, for a repeated positive match, get the caseless flag for the
7393
      required code unit from the previous character, because a class like [Aa]
7394
      sets a caseless A but by now the req_caseopt flag has been reset. */
7395
7396
255k
        {
7397
255k
        mcbuffer[0] = code[-1];
7398
255k
        mclength = 1;
7399
255k
        if (op_previous <= OP_CHARI && repeat_min > 1)
7400
7.56k
          {
7401
7.56k
          reqcu = mcbuffer[0];
7402
7.56k
          reqcuflags = cb->req_varyopt;
7403
7.56k
          if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7404
7.56k
          }
7405
255k
        }
7406
258k
      goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7407
7408
      /* If previous was a character class or a back reference, we put the
7409
      repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7410
7411
0
#ifdef SUPPORT_WIDE_CHARS
7412
16.3k
      case OP_XCLASS:
7413
18.2k
      case OP_ECLASS:
7414
18.2k
#endif
7415
28.1k
      case OP_CLASS:
7416
32.6k
      case OP_NCLASS:
7417
36.0k
      case OP_REF:
7418
37.8k
      case OP_REFI:
7419
38.1k
      case OP_DNREF:
7420
38.8k
      case OP_DNREFI:
7421
7422
38.8k
      if (repeat_max == 0)
7423
218
        {
7424
218
        code = previous;
7425
218
        goto END_REPEAT;
7426
218
        }
7427
38.6k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7428
7429
38.3k
      if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7430
10.2k
        *code++ = OP_CRSTAR + repeat_type;
7431
28.0k
      else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7432
11.1k
        *code++ = OP_CRPLUS + repeat_type;
7433
16.9k
      else if (repeat_min == 0 && repeat_max == 1)
7434
8.58k
        *code++ = OP_CRQUERY + repeat_type;
7435
8.32k
      else
7436
8.32k
        {
7437
8.32k
        *code++ = OP_CRRANGE + repeat_type;
7438
8.32k
        PUT2INC(code, 0, repeat_min);
7439
8.32k
        if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7440
8.32k
        PUT2INC(code, 0, repeat_max);
7441
8.32k
        }
7442
38.3k
      break;
7443
7444
      /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7445
      because pcre2_match() could not handle backtracking into recursively
7446
      called groups. Now that this backtracking is available, we no longer need
7447
      to do this. However, we still need to replicate recursions as we do for
7448
      groups so as to have independent backtracking points. We can replicate
7449
      for the minimum number of repeats directly. For optional repeats we now
7450
      wrap the recursion in OP_BRA brackets and make use of the bracket
7451
      repetition. */
7452
7453
7.55k
      case OP_RECURSE:
7454
7.55k
      if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7455
234
        goto END_REPEAT;
7456
7457
      /* Generate unwrapped repeats for a non-zero minimum, except when the
7458
      minimum is 1 and the maximum unlimited, because that can be handled with
7459
      OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7460
      minimum, we just need to generate the appropriate additional copies.
7461
      Otherwise we need to generate one more, to simulate the situation when
7462
      the minimum is zero. */
7463
7464
7.32k
      if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7465
1.86k
        {
7466
1.86k
        int replicate = repeat_min;
7467
7468
1.86k
        if (repeat_min == repeat_max) replicate--;
7469
7470
        /* In the pre-compile phase, we don't actually do the replication. We
7471
        just adjust the length as if we had. Do some paranoid checks for
7472
        potential integer overflow. */
7473
7474
1.86k
        if (lengthptr != NULL)
7475
954
          {
7476
954
          PCRE2_SIZE delta;
7477
954
          if (PRIV(ckd_smul)(&delta, replicate, (int)length_prevgroup) ||
7478
954
              OFLOW_MAX - *lengthptr < delta)
7479
0
            {
7480
0
            *errorcodeptr = ERR20;
7481
0
            return 0;
7482
0
            }
7483
954
          *lengthptr += delta;
7484
954
          }
7485
5.35k
        else for (int i = 0; i < replicate; i++)
7486
4.44k
          {
7487
4.44k
          memcpy(code, previous, CU2BYTES(length_prevgroup));
7488
4.44k
          previous = code;
7489
4.44k
          code += length_prevgroup;
7490
4.44k
          }
7491
7492
        /* If the number of repeats is fixed, we are done. Otherwise, adjust
7493
        the counts and fall through. */
7494
7495
1.86k
        if (repeat_min == repeat_max) break;
7496
619
        if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7497
619
        repeat_min = 0;
7498
619
        }
7499
7500
      /* Wrap the recursion call in OP_BRA brackets. */
7501
6.07k
        {
7502
6.07k
        PCRE2_SIZE length = (lengthptr != NULL) ? 1 + LINK_SIZE : length_prevgroup;
7503
7504
6.07k
        (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(length));
7505
6.07k
        op_previous = *previous = OP_BRA;
7506
6.07k
        PUT(previous, 1, 1 + LINK_SIZE + length);
7507
6.07k
        previous[1 + LINK_SIZE + length] = OP_KET;
7508
6.07k
        PUT(previous, 2 + LINK_SIZE + length, 1 + LINK_SIZE + length);
7509
6.07k
        }
7510
6.07k
      code += 2 + 2 * LINK_SIZE;
7511
6.07k
      length_prevgroup += 2 + 2 * LINK_SIZE;
7512
6.07k
      group_return = -1;  /* Set "may match empty string" */
7513
7514
      /* Now treat as a repeated OP_BRA. */
7515
6.07k
      PCRE2_FALLTHROUGH /* Fall through */
7516
7517
      /* If previous was a bracket group, we may have to replicate it in
7518
      certain cases. Note that at this point we can encounter only the "basic"
7519
      bracket opcodes such as BRA and CBRA, as this is the place where they get
7520
      converted into the more special varieties such as BRAPOS and SBRA.
7521
      Originally, PCRE did not allow repetition of assertions, but now it does,
7522
      for Perl compatibility. */
7523
7524
10.9k
      case OP_ASSERT:
7525
16.5k
      case OP_ASSERT_NOT:
7526
28.9k
      case OP_ASSERT_NA:
7527
31.3k
      case OP_ASSERTBACK:
7528
33.9k
      case OP_ASSERTBACK_NOT:
7529
35.0k
      case OP_ASSERTBACK_NA:
7530
35.5k
      case OP_ASSERT_SCS:
7531
42.0k
      case OP_ONCE:
7532
45.1k
      case OP_SCRIPT_RUN:
7533
58.0k
      case OP_BRA:
7534
186k
      case OP_CBRA:
7535
190k
      case OP_COND:
7536
190k
        {
7537
190k
        int len = (int)(code - previous);
7538
190k
        PCRE2_UCHAR *bralink = NULL;
7539
190k
        PCRE2_UCHAR *brazeroptr = NULL;
7540
7541
190k
        if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7542
494
          goto END_REPEAT;
7543
7544
        /* Repeating a DEFINE group (or any group where the condition is always
7545
        FALSE and there is only one branch) is pointless, but Perl allows the
7546
        syntax, so we just ignore the repeat. */
7547
7548
189k
        if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7549
0
            previous[GET(previous, 1)] != OP_ALT)
7550
0
          goto END_REPEAT;
7551
7552
        /* Perl allows all assertions to be quantified, and when they contain
7553
        capturing parentheses and/or are optional there are potential uses for
7554
        this feature. PCRE2 used to force the maximum quantifier to 1 on the
7555
        invalid grounds that further repetition was never useful. This was
7556
        always a bit pointless, since an assertion could be wrapped with a
7557
        repeated group to achieve the effect. General repetition is now
7558
        permitted, but if the maximum is unlimited it is set to one more than
7559
        the minimum. */
7560
7561
189k
        if (op_previous < OP_ONCE)    /* Assertion */
7562
29.3k
          {
7563
29.3k
          if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7564
29.3k
          }
7565
7566
        /* The case of a zero minimum is special because of the need to stick
7567
        OP_BRAZERO in front of it, and because the group appears once in the
7568
        data, whereas in other cases it appears the minimum number of times. For
7569
        this reason, it is simplest to treat this case separately, as otherwise
7570
        the code gets far too messy. There are several special subcases when the
7571
        minimum is zero. */
7572
7573
189k
        if (repeat_min == 0)
7574
21.1k
          {
7575
          /* If the maximum is also zero, we used to just omit the group from
7576
          the output altogether, like this:
7577
7578
          ** if (repeat_max == 0)
7579
          **   {
7580
          **   code = previous;
7581
          **   goto END_REPEAT;
7582
          **   }
7583
7584
          However, that fails when a group or a subgroup within it is
7585
          referenced as a subroutine from elsewhere in the pattern, so now we
7586
          stick in OP_SKIPZERO in front of it so that it is skipped on
7587
          execution. As we don't have a list of which groups are referenced, we
7588
          cannot do this selectively.
7589
7590
          If the maximum is 1 or unlimited, we just have to stick in the
7591
          BRAZERO and do no more at this point. */
7592
7593
21.1k
          if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7594
16.0k
            {
7595
16.0k
            (void)memmove(previous + 1, previous, CU2BYTES(len));
7596
16.0k
            code++;
7597
16.0k
            if (repeat_max == 0)
7598
658
              {
7599
658
              *previous++ = OP_SKIPZERO;
7600
658
              goto END_REPEAT;
7601
658
              }
7602
15.3k
            brazeroptr = previous;    /* Save for possessive optimizing */
7603
15.3k
            *previous++ = OP_BRAZERO + repeat_type;
7604
15.3k
            }
7605
7606
          /* If the maximum is greater than 1 and limited, we have to replicate
7607
          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7608
          The first one has to be handled carefully because it's the original
7609
          copy, which has to be moved up. The remainder can be handled by code
7610
          that is common with the non-zero minimum case below. We have to
7611
          adjust the value of repeat_max, since one less copy is required. */
7612
7613
5.13k
          else
7614
5.13k
            {
7615
5.13k
            int linkoffset;
7616
5.13k
            (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7617
5.13k
            code += 2 + LINK_SIZE;
7618
5.13k
            *previous++ = OP_BRAZERO + repeat_type;
7619
5.13k
            *previous++ = OP_BRA;
7620
7621
            /* We chain together the bracket link offset fields that have to be
7622
            filled in later when the ends of the brackets are reached. */
7623
7624
5.13k
            linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7625
5.13k
            bralink = previous;
7626
5.13k
            PUTINC(previous, 0, linkoffset);
7627
5.13k
            }
7628
7629
20.5k
          if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7630
20.5k
          }
7631
7632
        /* If the minimum is greater than zero, replicate the group as many
7633
        times as necessary, and adjust the maximum to the number of subsequent
7634
        copies that we need. */
7635
7636
168k
        else
7637
168k
          {
7638
168k
          if (repeat_min > 1)
7639
37.4k
            {
7640
            /* In the pre-compile phase, we don't actually do the replication.
7641
            We just adjust the length as if we had. Do some paranoid checks for
7642
            potential integer overflow. */
7643
7644
37.4k
            if (lengthptr != NULL)
7645
18.9k
              {
7646
18.9k
              PCRE2_SIZE delta;
7647
18.9k
              if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7648
18.9k
                                 (int)length_prevgroup) ||
7649
18.9k
                  OFLOW_MAX - *lengthptr < delta)
7650
5
                {
7651
5
                *errorcodeptr = ERR20;
7652
5
                return 0;
7653
5
                }
7654
18.9k
              *lengthptr += delta;
7655
18.9k
              }
7656
7657
            /* This is compiling for real. If there is a set first code unit
7658
            for the group, and we have not yet set a "required code unit", set
7659
            it. */
7660
7661
18.5k
            else
7662
18.5k
              {
7663
18.5k
              if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7664
430
                {
7665
430
                reqcu = firstcu;
7666
430
                reqcuflags = firstcuflags;
7667
430
                }
7668
5.02M
              for (uint32_t i = 1; i < repeat_min; i++)
7669
5.00M
                {
7670
5.00M
                memcpy(code, previous, CU2BYTES(len));
7671
5.00M
                code += len;
7672
5.00M
                }
7673
18.5k
              }
7674
37.4k
            }
7675
7676
168k
          if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7677
168k
          }
7678
7679
        /* This code is common to both the zero and non-zero minimum cases. If
7680
        the maximum is limited, it replicates the group in a nested fashion,
7681
        remembering the bracket starts on a stack. In the case of a zero
7682
        minimum, the first one was set up above. In all cases the repeat_max
7683
        now specifies the number of additional copies needed. Again, we must
7684
        remember to replicate entries on the forward reference list. */
7685
7686
189k
        if (repeat_max != REPEAT_UNLIMITED)
7687
53.4k
          {
7688
          /* In the pre-compile phase, we don't actually do the replication. We
7689
          just adjust the length as if we had. For each repetition we must add
7690
          1 to the length for BRAZERO and for all but the last repetition we
7691
          must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7692
          paranoid checks to avoid integer overflow. */
7693
7694
53.4k
          if (lengthptr != NULL && repeat_max > 0)
7695
5.05k
            {
7696
5.05k
            PCRE2_SIZE delta;
7697
5.05k
            if (PRIV(ckd_smul)(&delta, repeat_max,
7698
5.05k
                               (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7699
5.05k
                OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7700
3
              {
7701
3
              *errorcodeptr = ERR20;
7702
3
              return 0;
7703
3
              }
7704
5.05k
            delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7705
5.05k
            *lengthptr += delta;
7706
5.05k
            }
7707
7708
          /* This is compiling for real */
7709
7710
585k
          else for (uint32_t i = repeat_max; i >= 1; i--)
7711
537k
            {
7712
537k
            *code++ = OP_BRAZERO + repeat_type;
7713
7714
            /* All but the final copy start a new nesting, maintaining the
7715
            chain of brackets outstanding. */
7716
7717
537k
            if (i != 1)
7718
532k
              {
7719
532k
              int linkoffset;
7720
532k
              *code++ = OP_BRA;
7721
532k
              linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7722
532k
              bralink = code;
7723
532k
              PUTINC(code, 0, linkoffset);
7724
532k
              }
7725
7726
537k
            memcpy(code, previous, CU2BYTES(len));
7727
537k
            code += len;
7728
537k
            }
7729
7730
          /* Now chain through the pending brackets, and fill in their length
7731
          fields (which are holding the chain links pro tem). */
7732
7733
591k
          while (bralink != NULL)
7734
537k
            {
7735
537k
            int oldlinkoffset;
7736
537k
            int linkoffset = (int)(code - bralink + 1);
7737
537k
            PCRE2_UCHAR *bra = code - linkoffset;
7738
537k
            oldlinkoffset = GET(bra, 1);
7739
537k
            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7740
537k
            *code++ = OP_KET;
7741
537k
            PUTINC(code, 0, linkoffset);
7742
537k
            PUT(bra, 1, linkoffset);
7743
537k
            }
7744
53.4k
          }
7745
7746
        /* If the maximum is unlimited, set a repeater in the final copy. For
7747
        SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7748
        possessively repeated ONCE brackets can be converted into non-capturing
7749
        brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7750
        saves having to deal with possessive ONCEs specially.
7751
7752
        Otherwise, when we are doing the actual compile phase, check to see
7753
        whether this group is one that could match an empty string. If so,
7754
        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7755
        that runtime checking can be done. [This check is also applied to ONCE
7756
        and SCRIPT_RUN groups at runtime, but in a different way.]
7757
7758
        Then, if the quantifier was possessive and the bracket is not a
7759
        conditional, we convert the BRA code to the POS form, and the KET code
7760
        to KETRPOS. (It turns out to be convenient at runtime to detect this
7761
        kind of subpattern at both the start and at the end.) The use of
7762
        special opcodes makes it possible to reduce greatly the stack usage in
7763
        pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7764
        OP_BRAPOSZERO.
7765
7766
        Then, if the minimum number of matches is 1 or 0, cancel the possessive
7767
        flag so that the default action below, of wrapping everything inside
7768
        atomic brackets, does not happen. When the minimum is greater than 1,
7769
        there will be earlier copies of the group, and so we still have to wrap
7770
        the whole thing. */
7771
7772
135k
        else
7773
135k
          {
7774
135k
          PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7775
135k
          PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7776
7777
          /* Convert possessive ONCE brackets to non-capturing */
7778
7779
135k
          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7780
7781
          /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7782
          to do is to set the KET. */
7783
7784
135k
          if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7785
2.37k
            *ketcode = OP_KETRMAX + repeat_type;
7786
7787
          /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7788
          (which have been converted to non-capturing above). */
7789
7790
133k
          else
7791
133k
            {
7792
            /* In the compile phase, adjust the opcode if the group can match
7793
            an empty string. For a conditional group with only one branch, the
7794
            value of group_return will not show "could be empty", so we must
7795
            check that separately. */
7796
7797
133k
            if (lengthptr == NULL)
7798
66.2k
              {
7799
66.2k
              if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7800
66.2k
              if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7801
473
                *bracode = OP_SCOND;
7802
66.2k
              }
7803
7804
            /* Handle possessive quantifiers. */
7805
7806
133k
            if (possessive_quantifier)
7807
112k
              {
7808
              /* For COND brackets, we wrap the whole thing in a possessively
7809
              repeated non-capturing bracket, because we have not invented POS
7810
              versions of the COND opcodes. */
7811
7812
112k
              if (*bracode == OP_COND || *bracode == OP_SCOND)
7813
246
                {
7814
246
                int nlen = (int)(code - bracode);
7815
246
                (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7816
246
                code += 1 + LINK_SIZE;
7817
246
                nlen += 1 + LINK_SIZE;
7818
246
                *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7819
246
                *code++ = OP_KETRPOS;
7820
246
                PUTINC(code, 0, nlen);
7821
246
                PUT(bracode, 1, nlen);
7822
246
                }
7823
7824
              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7825
7826
112k
              else
7827
112k
                {
7828
112k
                *bracode += 1;              /* Switch to xxxPOS opcodes */
7829
112k
                *ketcode = OP_KETRPOS;
7830
112k
                }
7831
7832
              /* If the minimum is zero, mark it as possessive, then unset the
7833
              possessive flag when the minimum is 0 or 1. */
7834
7835
112k
              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7836
112k
              if (repeat_min < 2) possessive_quantifier = FALSE;
7837
112k
              }
7838
7839
            /* Non-possessive quantifier */
7840
7841
20.8k
            else *ketcode = OP_KETRMAX + repeat_type;
7842
133k
            }
7843
135k
          }
7844
189k
        }
7845
189k
      break;
7846
7847
      /* If previous was a character type match (\d or similar), abolish it and
7848
      create a suitable repeat item. The code is shared with single-character
7849
      repeats by setting op_type to add a suitable offset into repeat_type.
7850
      Note that the Unicode property types will be present only when
7851
      SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7852
      here because it just makes it horribly messy. */
7853
7854
189k
      default:
7855
7856
      /* LCOV_EXCL_START */
7857
125k
      if (op_previous >= OP_EODN || op_previous <= OP_WORD_BOUNDARY)
7858
0
        {
7859
0
        PCRE2_DEBUG_UNREACHABLE();
7860
0
        *errorcodeptr = ERR10;  /* Not a character type - internal error */
7861
0
        return 0;
7862
0
        }
7863
      /* LCOV_EXCL_STOP */
7864
7865
125k
        {
7866
125k
        int prop_type, prop_value;
7867
125k
        PCRE2_UCHAR *oldcode;
7868
7869
125k
        if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7870
7871
125k
        op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7872
125k
        mclength = 0;                         /* Not a character */
7873
7874
125k
        if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7875
24.5k
          {
7876
24.5k
          prop_type = previous[1];
7877
24.5k
          prop_value = previous[2];
7878
24.5k
          }
7879
100k
        else
7880
100k
          {
7881
          /* Come here from just above with a character in mcbuffer/mclength.
7882
          You must also set op_type before the jump. */
7883
358k
          OUTPUT_SINGLE_REPEAT:
7884
358k
          prop_type = prop_value = -1;
7885
358k
          }
7886
7887
        /* At this point, if prop_type == prop_value == -1 we either have a
7888
        character in mcbuffer when mclength is greater than zero, or we have
7889
        mclength zero, in which case there is a non-property character type in
7890
        op_previous. If prop_type/value are not negative, we have a property
7891
        character type in op_previous. */
7892
7893
383k
        oldcode = code;                   /* Save where we were */
7894
383k
        code = previous;                  /* Usually overwrite previous item */
7895
7896
        /* If the maximum is zero then the minimum must also be zero; Perl allows
7897
        this case, so we do too - by simply omitting the item altogether. */
7898
7899
383k
        if (repeat_max == 0) goto END_REPEAT;
7900
7901
        /* Combine the op_type with the repeat_type */
7902
7903
382k
        repeat_type += op_type;
7904
7905
        /* A minimum of zero is handled either as the special case * or ?, or as
7906
        an UPTO, with the maximum given. */
7907
7908
382k
        if (repeat_min == 0)
7909
216k
          {
7910
216k
          if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7911
101k
            else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7912
7.67k
          else
7913
7.67k
            {
7914
7.67k
            *code++ = OP_UPTO + repeat_type;
7915
7.67k
            PUT2INC(code, 0, repeat_max);
7916
7.67k
            }
7917
216k
          }
7918
7919
        /* A repeat minimum of 1 is optimized into some special cases. If the
7920
        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7921
        left in place and, if the maximum is greater than 1, we use OP_UPTO with
7922
        one less than the maximum. */
7923
7924
165k
        else if (repeat_min == 1)
7925
143k
          {
7926
143k
          if (repeat_max == REPEAT_UNLIMITED)
7927
140k
            *code++ = OP_PLUS + repeat_type;
7928
3.52k
          else
7929
3.52k
            {
7930
3.52k
            code = oldcode;  /* Leave previous item in place */
7931
3.52k
            if (repeat_max == 1) goto END_REPEAT;
7932
3.52k
            *code++ = OP_UPTO + repeat_type;
7933
3.52k
            PUT2INC(code, 0, repeat_max - 1);
7934
3.52k
            }
7935
143k
          }
7936
7937
        /* The case {n,n} is just an EXACT, while the general case {n,m} is
7938
        handled as an EXACT followed by an UPTO or STAR or QUERY. */
7939
7940
22.0k
        else
7941
22.0k
          {
7942
22.0k
          *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7943
22.0k
          PUT2INC(code, 0, repeat_min);
7944
7945
          /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7946
          and then generate the second opcode. For a repeated Unicode property
7947
          match, there are two extra values that define the required property,
7948
          and mclength is set zero to indicate this. */
7949
7950
22.0k
          if (repeat_max != repeat_min)
7951
11.0k
            {
7952
11.0k
            if (mclength > 0)
7953
7.08k
              {
7954
7.08k
              memcpy(code, mcbuffer, CU2BYTES(mclength));
7955
7.08k
              code += mclength;
7956
7.08k
              }
7957
3.93k
            else
7958
3.93k
              {
7959
3.93k
              *code++ = op_previous;
7960
3.93k
              if (prop_type >= 0)
7961
1.14k
                {
7962
1.14k
                *code++ = prop_type;
7963
1.14k
                *code++ = prop_value;
7964
1.14k
                }
7965
3.93k
              }
7966
7967
            /* Now set up the following opcode */
7968
7969
11.0k
            if (repeat_max == REPEAT_UNLIMITED)
7970
3.46k
              *code++ = OP_STAR + repeat_type;
7971
7.55k
            else
7972
7.55k
              {
7973
7.55k
              repeat_max -= repeat_min;
7974
7.55k
              if (repeat_max == 1)
7975
1.09k
                {
7976
1.09k
                *code++ = OP_QUERY + repeat_type;
7977
1.09k
                }
7978
6.46k
              else
7979
6.46k
                {
7980
6.46k
                *code++ = OP_UPTO + repeat_type;
7981
6.46k
                PUT2INC(code, 0, repeat_max);
7982
6.46k
                }
7983
7.55k
              }
7984
11.0k
            }
7985
22.0k
          }
7986
7987
        /* Fill in the character or character type for the final opcode. */
7988
7989
382k
        if (mclength > 0)
7990
257k
          {
7991
257k
          memcpy(code, mcbuffer, CU2BYTES(mclength));
7992
257k
          code += mclength;
7993
257k
          }
7994
124k
        else
7995
124k
          {
7996
124k
          *code++ = op_previous;
7997
124k
          if (prop_type >= 0)
7998
24.5k
            {
7999
24.5k
            *code++ = prop_type;
8000
24.5k
            *code++ = prop_value;
8001
24.5k
            }
8002
124k
          }
8003
382k
        }
8004
0
      break;
8005
614k
      }  /* End of switch on different op_previous values */
8006
8007
8008
    /* If the character following a repeat is '+', possessive_quantifier is
8009
    TRUE. For some opcodes, there are special alternative opcodes for this
8010
    case. For anything else, we wrap the entire repeated item inside OP_ONCE
8011
    brackets. Logically, the '+' notation is just syntactic sugar, taken from
8012
    Sun's Java package, but the special opcodes can optimize it.
8013
8014
    Some (but not all) possessively repeated subpatterns have already been
8015
    completely handled in the code just above. For them, possessive_quantifier
8016
    is always FALSE at this stage. Note that the repeated item starts at
8017
    tempcode, not at previous, which might be the first part of a string whose
8018
    (former) last char we repeated. */
8019
8020
611k
    if (possessive_quantifier)
8021
28.4k
      {
8022
28.4k
      int len;
8023
8024
      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
8025
      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
8026
      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
8027
      remains is greater than zero, there's a further opcode that can be
8028
      handled. If not, do nothing, leaving the EXACT alone. */
8029
8030
28.4k
      switch(*tempcode)
8031
28.4k
        {
8032
1.10k
        case OP_TYPEEXACT:
8033
1.10k
        tempcode += PRIV(OP_lengths)[*tempcode] +
8034
1.10k
          ((tempcode[1 + IMM2_SIZE] == OP_PROP
8035
858
          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
8036
1.10k
        break;
8037
8038
        /* CHAR opcodes are used for exacts whose count is 1. */
8039
8040
512
        case OP_CHAR:
8041
829
        case OP_CHARI:
8042
1.04k
        case OP_NOT:
8043
1.25k
        case OP_NOTI:
8044
2.61k
        case OP_EXACT:
8045
3.17k
        case OP_EXACTI:
8046
3.46k
        case OP_NOTEXACT:
8047
3.69k
        case OP_NOTEXACTI:
8048
3.69k
        tempcode += PRIV(OP_lengths)[*tempcode];
8049
3.69k
#ifdef SUPPORT_UNICODE
8050
3.69k
        if (utf && HAS_EXTRALEN(tempcode[-1]))
8051
256
          tempcode += GET_EXTRALEN(tempcode[-1]);
8052
3.69k
#endif
8053
3.69k
        break;
8054
8055
        /* For the class opcodes, the repeat operator appears at the end;
8056
        adjust tempcode to point to it. */
8057
8058
453
        case OP_CLASS:
8059
819
        case OP_NCLASS:
8060
819
        tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
8061
819
        break;
8062
8063
0
#ifdef SUPPORT_WIDE_CHARS
8064
2.49k
        case OP_XCLASS:
8065
2.69k
        case OP_ECLASS:
8066
2.69k
        tempcode += GET(tempcode, 1);
8067
2.69k
        break;
8068
0
#endif
8069
8070
653
        case OP_REF:
8071
991
        case OP_REFI:
8072
1.09k
        case OP_DNREF:
8073
1.40k
        case OP_DNREFI:
8074
1.40k
        tempcode += PRIV(OP_lengths)[*tempcode];
8075
1.40k
        break;
8076
28.4k
        }
8077
8078
      /* If tempcode is equal to code (which points to the end of the repeated
8079
      item), it means we have skipped an EXACT item but there is no following
8080
      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
8081
      all other cases, tempcode will be pointing to the repeat opcode, and will
8082
      be less than code, so the value of len will be greater than 0. */
8083
8084
28.4k
      len = (int)(code - tempcode);
8085
28.4k
      if (len > 0)
8086
26.9k
        {
8087
26.9k
        unsigned int repcode = *tempcode;
8088
8089
        /* There is a table for possessifying opcodes, all of which are less
8090
        than OP_CALLOUT. A zero entry means there is no possessified version.
8091
        */
8092
8093
26.9k
        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
8094
21.5k
          *tempcode = opcode_possessify[repcode];
8095
8096
        /* For opcodes without a special possessified version, wrap the item in
8097
        ONCE brackets. */
8098
8099
5.34k
        else
8100
5.34k
          {
8101
5.34k
          (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
8102
5.34k
          code += 1 + LINK_SIZE;
8103
5.34k
          len += 1 + LINK_SIZE;
8104
5.34k
          tempcode[0] = OP_ONCE;
8105
5.34k
          *code++ = OP_KET;
8106
5.34k
          PUTINC(code, 0, len);
8107
5.34k
          PUT(tempcode, 1, len);
8108
5.34k
          }
8109
26.9k
        }
8110
28.4k
      }
8111
8112
    /* We set the "follows varying string" flag for subsequently encountered
8113
    reqcus if it isn't already set and we have just passed a varying length
8114
    item. */
8115
8116
614k
    END_REPEAT:
8117
614k
    cb->req_varyopt |= reqvary;
8118
614k
    break;
8119
8120
8121
    /* ===================================================================*/
8122
    /* Handle a 32-bit data character with a value greater than META_END. */
8123
8124
0
    case META_BIGVALUE:
8125
0
    pptr++;
8126
0
    goto NORMAL_CHAR;
8127
8128
8129
    /* ===============================================================*/
8130
    /* Handle a back reference by number, which is the meta argument. The
8131
    pattern offsets for back references to group numbers less than 10 are held
8132
    in a special vector, to avoid using more than two parsed pattern elements
8133
    in 64-bit environments. We only need the offset to the first occurrence,
8134
    because if that doesn't fail, subsequent ones will also be OK. */
8135
8136
14.7k
    case META_BACKREF:
8137
14.7k
    if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8138
503
      else GETPLUSOFFSET(offset, pptr);
8139
8140
14.7k
    if (meta_arg > cb->bracount)
8141
955
      {
8142
955
      cb->erroroffset = offset;
8143
955
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8144
955
      return 0;
8145
955
      }
8146
8147
    /* Come here from named backref handling when the reference is to a
8148
    single group (that is, not to a duplicated name). The back reference
8149
    data will have already been updated. We must disable firstcu if not
8150
    set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8151
    later. */
8152
8153
14.4k
    HANDLE_SINGLE_REFERENCE:
8154
14.4k
    if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8155
14.4k
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8156
14.4k
    PUT2INC(code, 0, meta_arg);
8157
14.4k
    if ((options & PCRE2_CASELESS) != 0)
8158
5.09k
      *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
8159
4.68k
                 REFI_FLAG_CASELESS_RESTRICT : 0) |
8160
5.09k
                (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
8161
5.09k
                 REFI_FLAG_TURKISH_CASING : 0);
8162
8163
    /* Update the map of back references, and keep the highest one. We
8164
    could do this in parse_regex() for numerical back references, but not
8165
    for named back references, because we don't know the numbers to which
8166
    named back references refer. So we do it all in this function. */
8167
8168
14.4k
    cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8169
14.4k
    if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8170
14.4k
    break;
8171
8172
8173
    /* ===============================================================*/
8174
    /* Handle recursion by inserting the number of the called group (which is
8175
    the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8176
    scanned and these numbers are replaced by offsets within the pattern. It is
8177
    done like this to avoid problems with forward references and adjusting
8178
    offsets when groups are duplicated and moved (as discovered in previous
8179
    implementations). Note that a recursion does not have a set first
8180
    character. */
8181
8182
43.9k
    case META_RECURSE:
8183
43.9k
    GETPLUSOFFSET(offset, pptr);
8184
43.9k
    if (meta_arg > cb->bracount)
8185
246
      {
8186
246
      cb->erroroffset = offset;
8187
246
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8188
246
      return 0;
8189
246
      }
8190
44.1k
    HANDLE_NUMERICAL_RECURSION:
8191
44.1k
    *code = OP_RECURSE;
8192
44.1k
    PUT(code, 1, meta_arg);
8193
44.1k
    code += 1 + LINK_SIZE;
8194
    /* Repeat processing requires this information to
8195
    determine the real length in pre-compile phase. */
8196
44.1k
    length_prevgroup = 1 + LINK_SIZE;
8197
8198
44.1k
    if (META_CODE(pptr[1]) == META_OFFSET ||
8199
44.1k
        META_CODE(pptr[1]) == META_CAPTURE_NAME ||
8200
42.6k
        META_CODE(pptr[1]) == META_CAPTURE_NUMBER)
8201
14.1k
      {
8202
14.1k
      recurse_arguments *args;
8203
8204
14.1k
      if (lengthptr != NULL)
8205
7.23k
        {
8206
7.23k
        if (!PRIV(compile_parse_recurse_args)(pptr, offset, errorcodeptr, cb))
8207
52
          return 0;
8208
8209
7.17k
        args = (recurse_arguments*)cb->last_data;
8210
7.17k
        length_prevgroup += (args->size * (1 + IMM2_SIZE));
8211
7.17k
        *lengthptr += (args->size * (1 + IMM2_SIZE));
8212
7.17k
        pptr += args->skip_size;
8213
7.17k
        }
8214
6.96k
      else
8215
6.96k
        {
8216
6.96k
        uint16_t *current, *end;
8217
8218
6.96k
        args = (recurse_arguments*)cb->first_data;
8219
6.96k
        PCRE2_ASSERT(args != NULL && args->header.type == CDATA_RECURSE_ARGS);
8220
8221
6.96k
        current = (uint16_t*)(args + 1);
8222
6.96k
        end = current + args->size;
8223
6.96k
        PCRE2_ASSERT(end > current);
8224
8225
6.96k
        do
8226
35.6k
          {
8227
35.6k
          code[0] = OP_CREF;
8228
35.6k
          PUT2(code, 1, *current);
8229
35.6k
          code += 1 + IMM2_SIZE;
8230
35.6k
          }
8231
35.6k
        while (++current < end);
8232
8233
6.96k
        length_prevgroup += (args->size * (1 + IMM2_SIZE));
8234
6.96k
        pptr += args->skip_size;
8235
6.96k
        cb->first_data = args->header.next;
8236
6.96k
        cb->cx->memctl.free(args, cb->cx->memctl.memory_data);
8237
6.96k
        }
8238
14.1k
      }
8239
8240
44.0k
    groupsetfirstcu = FALSE;
8241
44.0k
    cb->had_recurse = TRUE;
8242
44.0k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8243
44.0k
    zerofirstcu = firstcu;
8244
44.0k
    zerofirstcuflags = firstcuflags;
8245
44.0k
    break;
8246
8247
8248
    /* ===============================================================*/
8249
    /* Handle capturing parentheses; the number is the meta argument. */
8250
8251
206k
    case META_CAPTURE:
8252
206k
    bravalue = OP_CBRA;
8253
206k
    skipunits = IMM2_SIZE;
8254
206k
    PUT2(code, 1+LINK_SIZE, meta_arg);
8255
206k
    cb->lastcapture = meta_arg;
8256
206k
    goto GROUP_PROCESS_NOTE_EMPTY;
8257
8258
8259
    /* ===============================================================*/
8260
    /* Handle escape sequence items. For ones like \d, the ESC_values are
8261
    arranged to be the same as the corresponding OP_values in the default case
8262
    when PCRE2_UCP is not set (which is the only case in which they will appear
8263
    here).
8264
8265
    Note: \Q and \E are never seen here, as they were dealt with in
8266
    parse_regex(). Neither are numerical back references or recursions, which
8267
    were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8268
    \g, when followed by names, are turned into META_BACKREF_BYNAME or
8269
    META_RECURSE_BYNAME. */
8270
8271
213k
    case META_ESCAPE:
8272
8273
    /* We can test for escape sequences that consume a character because their
8274
    values lie between ESC_b and ESC_Z; this may have to change if any new ones
8275
    are ever created. For these sequences, we disable the setting of a first
8276
    character if it hasn't already been set. */
8277
8278
213k
    if (meta_arg > ESC_b && meta_arg < ESC_Z)
8279
179k
      {
8280
179k
      matched_char = TRUE;
8281
179k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8282
179k
      }
8283
8284
    /* Set values to reset to if this is followed by a zero repeat. */
8285
8286
213k
    zerofirstcu = firstcu;
8287
213k
    zerofirstcuflags = firstcuflags;
8288
213k
    zeroreqcu = reqcu;
8289
213k
    zeroreqcuflags = reqcuflags;
8290
8291
    /* If Unicode is not supported, \P and \p are not allowed and are
8292
    faulted at parse time, so will never appear here. */
8293
8294
213k
#ifdef SUPPORT_UNICODE
8295
213k
    if (meta_arg == ESC_P || meta_arg == ESC_p)
8296
36.2k
      {
8297
36.2k
      uint32_t ptype = *(++pptr) >> 16;
8298
36.2k
      uint32_t pdata = *pptr & 0xffff;
8299
8300
      /* In caseless matching, particular characteristics Lu, Ll, and Lt get
8301
      converted to the general characteristic L&. That is, upper, lower, and
8302
      title case letters are all conflated. */
8303
8304
36.2k
      if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
8305
5.24k
          (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
8306
819
        {
8307
819
        ptype = PT_LAMP;
8308
819
        pdata = 0;
8309
819
        }
8310
8311
      /* The special case of \p{Any} is compiled to OP_ALLANY and \P{Any}
8312
      is compiled to [] so as to benefit from the auto-anchoring code. */
8313
8314
36.2k
      if (ptype == PT_ANY)
8315
0
        {
8316
0
        if (meta_arg == ESC_P)
8317
0
          {
8318
0
          *code++ = OP_CLASS;
8319
0
          memset(code, 0, 32);
8320
0
          code += 32 / sizeof(PCRE2_UCHAR);
8321
0
          }
8322
0
        else
8323
0
          *code++ = OP_ALLANY;
8324
0
        }
8325
36.2k
      else
8326
36.2k
        {
8327
36.2k
        *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8328
36.2k
        *code++ = ptype;
8329
36.2k
        *code++ = pdata;
8330
36.2k
        }
8331
36.2k
      break;  /* End META_ESCAPE */
8332
36.2k
      }
8333
177k
#endif
8334
8335
    /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8336
    done. However, there's an option, in case anyone was relying on it. */
8337
8338
177k
    if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8339
5
        (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8340
5
      {
8341
5
      *errorcodeptr = ERR99;
8342
5
      return 0;
8343
5
      }
8344
8345
    /* For the rest (including \X when Unicode is supported - if not it's
8346
    faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8347
    not set; if it is set, most of them do not show up here because they are
8348
    converted into Unicode property tests in parse_regex().
8349
8350
    In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8351
    instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8352
    There are special UCP codes for \B and \b which are used in UCP mode unless
8353
    "word" matching is being forced to ASCII.
8354
8355
    Note that \b and \B do a one-character lookbehind, and \A also behaves as
8356
    if it does. */
8357
8358
177k
    switch(meta_arg)
8359
177k
      {
8360
0
      case ESC_C:
8361
0
      cb->external_flags |= PCRE2_HASBKC;  /* Record */
8362
#if PCRE2_CODE_UNIT_WIDTH == 32
8363
      meta_arg = OP_ALLANY;
8364
      (void)utf; /* Avoid compiler warning. */
8365
#else
8366
0
      if (!utf) meta_arg = OP_ALLANY;
8367
0
#endif
8368
0
      break;
8369
8370
6.76k
      case ESC_B:
8371
13.2k
      case ESC_b:
8372
13.2k
      if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8373
3.52k
        meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8374
3.52k
          OP_UCP_WORD_BOUNDARY;
8375
13.2k
      PCRE2_FALLTHROUGH /* Fall through */
8376
8377
18.7k
      case ESC_A:
8378
18.7k
      if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8379
18.7k
      break;
8380
8381
2.04k
      case ESC_K:
8382
2.04k
      cb->external_flags |= PCRE2_HASBSK;  /* Record */
8383
2.04k
      break;
8384
177k
      }
8385
8386
177k
    *code++ = meta_arg;
8387
177k
    break;  /* End META_ESCAPE */
8388
8389
8390
    /* ===================================================================*/
8391
    /* Handle an unrecognized meta value. A parsed pattern value less than
8392
    META_END is a literal. Otherwise we have a problem. */
8393
8394
8.85M
    default:
8395
    /* LCOV_EXCL_START */
8396
8.85M
    if (meta >= META_END)
8397
0
      {
8398
0
      PCRE2_DEBUG_UNREACHABLE();
8399
0
      *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8400
0
      return 0;
8401
0
      }
8402
    /* LCOV_EXCL_STOP */
8403
8404
    /* Handle a literal character. We come here by goto in the case of a
8405
    32-bit, non-UTF character whose value is greater than META_END. */
8406
8407
8.85M
    NORMAL_CHAR:
8408
8.85M
    meta = *pptr;     /* Get the full 32 bits */
8409
8.85M
    NORMAL_CHAR_SET:  /* Character is already in meta */
8410
8.85M
    matched_char = TRUE;
8411
8412
    /* For caseless UTF or UCP mode, check whether this character has more than
8413
    one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8414
    When casing restrictions apply, ignore caseless sets that start with an
8415
    ASCII character. If the character is affected by the special Turkish rules,
8416
    hardcode the matching characters using a caseset. */
8417
8418
8.85M
#ifdef SUPPORT_UNICODE
8419
8.85M
    if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8420
972k
      {
8421
972k
      uint32_t caseset;
8422
8423
972k
      if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
8424
972k
            PCRE2_EXTRA_TURKISH_CASING &&
8425
0
          UCD_ANY_I(meta))
8426
0
        {
8427
0
        caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(meta)? 0 : 3);
8428
0
        }
8429
972k
      else if ((caseset = UCD_CASESET(meta)) != 0 &&
8430
8.74k
               (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
8431
748
               PRIV(ucd_caseless_sets)[caseset] < 128)
8432
510
        {
8433
510
        caseset = 0;  /* Ignore the caseless set if it's restricted. */
8434
510
        }
8435
8436
972k
      if (caseset != 0)
8437
8.23k
        {
8438
8.23k
        *code++ = OP_PROP;
8439
8.23k
        *code++ = PT_CLIST;
8440
8.23k
        *code++ = caseset;
8441
8.23k
        if (firstcuflags == REQ_UNSET)
8442
1.77k
          firstcuflags = zerofirstcuflags = REQ_NONE;
8443
8.23k
        break;  /* End handling this meta item */
8444
8.23k
        }
8445
972k
      }
8446
8.85M
#endif
8447
8448
    /* Caseful matches, or caseless and not one of the multicase characters. We
8449
    come here by goto in the case of a positive class that contains only
8450
    case-partners of a character with just two cases; matched_char has already
8451
    been set TRUE and options fudged if necessary. */
8452
8453
8.85M
    CLASS_CASELESS_CHAR:
8454
8455
    /* Get the character's code units into mcbuffer, with the length in
8456
    mclength. When not in UTF mode, the length is always 1. */
8457
8458
8.85M
#ifdef SUPPORT_UNICODE
8459
8.85M
    if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8460
7.86M
#endif
8461
7.86M
      {
8462
7.86M
      mclength = 1;
8463
7.86M
      mcbuffer[0] = meta;
8464
7.86M
      }
8465
8466
    /* Generate the appropriate code */
8467
8468
8.85M
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8469
8.85M
    memcpy(code, mcbuffer, CU2BYTES(mclength));
8470
8.85M
    code += mclength;
8471
8472
    /* Remember if \r or \n were seen */
8473
8474
8.85M
    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8475
19.0k
      cb->external_flags |= PCRE2_HASCRORLF;
8476
8477
    /* Set the first and required code units appropriately. If no previous
8478
    first code unit, set it from this character, but revert to none on a zero
8479
    repeat. Otherwise, leave the firstcu value alone, and don't change it on
8480
    a zero repeat. */
8481
8482
8.85M
    if (firstcuflags == REQ_UNSET)
8483
267k
      {
8484
267k
      zerofirstcuflags = REQ_NONE;
8485
267k
      zeroreqcu = reqcu;
8486
267k
      zeroreqcuflags = reqcuflags;
8487
8488
      /* If the character is more than one code unit long, we can set a single
8489
      firstcu only if it is not to be matched caselessly. Multiple possible
8490
      starting code units may be picked up later in the studying code. */
8491
8492
267k
      if (mclength == 1 || req_caseopt == 0)
8493
266k
        {
8494
266k
        firstcu = mcbuffer[0];
8495
266k
        firstcuflags = req_caseopt;
8496
266k
        if (mclength != 1)
8497
992
          {
8498
992
          reqcu = code[-1];
8499
992
          reqcuflags = cb->req_varyopt;
8500
992
          }
8501
266k
        }
8502
805
      else firstcuflags = reqcuflags = REQ_NONE;
8503
267k
      }
8504
8505
    /* firstcu was previously set; we can set reqcu only if the length is
8506
    1 or the matching is caseful. */
8507
8508
8.58M
    else
8509
8.58M
      {
8510
8.58M
      zerofirstcu = firstcu;
8511
8.58M
      zerofirstcuflags = firstcuflags;
8512
8.58M
      zeroreqcu = reqcu;
8513
8.58M
      zeroreqcuflags = reqcuflags;
8514
8.58M
      if (mclength == 1 || req_caseopt == 0)
8515
8.58M
        {
8516
8.58M
        reqcu = code[-1];
8517
8.58M
        reqcuflags = req_caseopt | cb->req_varyopt;
8518
8.58M
        }
8519
8.58M
      }
8520
8521
    /* If caselessness was temporarily instated, reset it. */
8522
8523
8.85M
    if (reset_caseful)
8524
457
      {
8525
457
      options &= ~PCRE2_CASELESS;
8526
457
      req_caseopt = 0;
8527
457
      reset_caseful = FALSE;
8528
457
      }
8529
8530
8.85M
    break;    /* End literal character handling */
8531
12.4M
    }         /* End of big switch */
8532
12.4M
  }           /* End of big loop */
8533
8534
/* LCOV_EXCL_START */
8535
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8536
0
return 0;                  /* Avoid compiler warnings */
8537
/* LCOV_EXCL_STOP */
8538
662k
}
8539
8540
8541
8542
/*************************************************
8543
*   Compile regex: a sequence of alternatives    *
8544
*************************************************/
8545
8546
/* On entry, pptr is pointing past the bracket meta, but on return it points to
8547
the closing bracket or META_END. The code variable is pointing at the code unit
8548
into which the BRA operator has been stored. This function is used during the
8549
pre-compile phase when we are trying to find out the amount of memory needed,
8550
as well as during the real compile phase. The value of lengthptr distinguishes
8551
the two phases.
8552
8553
Arguments:
8554
  options           option bits, including any changes for this subpattern
8555
  xoptions          extra option bits, ditto
8556
  codeptr           -> the address of the current code pointer
8557
  pptrptr           -> the address of the current parsed pattern pointer
8558
  errorcodeptr      -> pointer to error code variable
8559
  skipunits         skip this many code units at start (for brackets and OP_COND)
8560
  firstcuptr        place to put the first required code unit
8561
  firstcuflagsptr   place to put the first code unit flags
8562
  reqcuptr          place to put the last required code unit
8563
  reqcuflagsptr     place to put the last required code unit flags
8564
  bcptr             pointer to the chain of currently open branches
8565
  cb                points to the data block with tables pointers etc.
8566
  lengthptr         NULL during the real compile phase
8567
                    points to length accumulator during pre-compile phase
8568
8569
Returns:            0 There has been an error
8570
                   +1 Success, this group must match at least one character
8571
                   -1 Success, this group may match an empty string
8572
*/
8573
8574
static int
8575
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8576
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8577
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8578
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8579
  compile_block *cb, PCRE2_SIZE *lengthptr)
8580
475k
{
8581
475k
PCRE2_UCHAR *code = *codeptr;
8582
475k
PCRE2_UCHAR *last_branch = code;
8583
475k
PCRE2_UCHAR *start_bracket = code;
8584
475k
BOOL lookbehind;
8585
475k
open_capitem capitem;
8586
475k
int capnumber = 0;
8587
475k
int okreturn = 1;
8588
475k
uint32_t *pptr = *pptrptr;
8589
475k
uint32_t firstcu, reqcu;
8590
475k
uint32_t lookbehindlength;
8591
475k
uint32_t lookbehindminlength;
8592
475k
uint32_t firstcuflags, reqcuflags;
8593
475k
PCRE2_SIZE length;
8594
475k
branch_chain bc;
8595
8596
/* If set, call the external function that checks for stack availability. */
8597
8598
475k
if (cb->cx->stack_guard != NULL &&
8599
0
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8600
0
  {
8601
0
  *errorcodeptr= ERR33;
8602
0
  cb->erroroffset = 0;
8603
0
  return 0;
8604
0
  }
8605
8606
/* Miscellaneous initialization */
8607
8608
475k
bc.outer = bcptr;
8609
475k
bc.current_branch = code;
8610
8611
475k
firstcu = reqcu = 0;
8612
475k
firstcuflags = reqcuflags = REQ_UNSET;
8613
8614
/* Accumulate the length for use in the pre-compile phase. Start with the
8615
length of the BRA and KET and any extra code units that are required at the
8616
beginning. We accumulate in a local variable to save frequent testing of
8617
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8618
start and end of each alternative, because compiled items are discarded during
8619
the pre-compile phase so that the workspace is not exceeded. */
8620
8621
475k
length = 2 + 2*LINK_SIZE + skipunits;
8622
8623
/* Remember if this is a lookbehind assertion, and if it is, save its length
8624
and skip over the pattern offset. */
8625
8626
475k
lookbehind = *code == OP_ASSERTBACK ||
8627
468k
             *code == OP_ASSERTBACK_NOT ||
8628
461k
             *code == OP_ASSERTBACK_NA;
8629
8630
475k
if (lookbehind)
8631
16.8k
  {
8632
16.8k
  lookbehindlength = META_DATA(pptr[-1]);
8633
16.8k
  lookbehindminlength = *pptr;
8634
16.8k
  pptr += SIZEOFFSET;
8635
16.8k
  }
8636
458k
else lookbehindlength = lookbehindminlength = 0;
8637
8638
/* If this is a capturing subpattern, add to the chain of open capturing items
8639
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8640
need be tested here; changing this opcode to one of its variants, e.g.
8641
OP_SCBRAPOS, happens later, after the group has been compiled. */
8642
8643
475k
if (*code == OP_CBRA)
8644
206k
  {
8645
206k
  capnumber = GET2(code, 1 + LINK_SIZE);
8646
206k
  capitem.number = capnumber;
8647
206k
  capitem.next = open_caps;
8648
206k
  capitem.assert_depth = cb->assert_depth;
8649
206k
  open_caps = &capitem;
8650
206k
  }
8651
8652
/* Offset is set zero to mark that this bracket is still open */
8653
8654
475k
PUT(code, 1, 0);
8655
475k
code += 1 + LINK_SIZE + skipunits;
8656
8657
/* Loop for each alternative branch */
8658
8659
475k
for (;;)
8660
662k
  {
8661
662k
  int branch_return;
8662
662k
  uint32_t branchfirstcu = 0, branchreqcu = 0;
8663
662k
  uint32_t branchfirstcuflags = REQ_UNSET, branchreqcuflags = REQ_UNSET;
8664
8665
  /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8666
  is only a single minimum length for the whole assertion. When the minimum
8667
  length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8668
  though not necessarily the same length. In this case, the original OP_REVERSE
8669
  can be used. It can also be used if a branch in a variable length lookbehind
8670
  has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8671
  maximum and minimum values. */
8672
8673
662k
  if (lookbehind && lookbehindlength > 0)
8674
20.4k
    {
8675
20.4k
    if (lookbehindminlength == LOOKBEHIND_MAX ||
8676
9.77k
        lookbehindminlength == lookbehindlength)
8677
12.1k
      {
8678
12.1k
      *code++ = OP_REVERSE;
8679
12.1k
      PUT2INC(code, 0, lookbehindlength);
8680
12.1k
      length += 1 + IMM2_SIZE;
8681
12.1k
      }
8682
8.24k
    else
8683
8.24k
      {
8684
8.24k
      *code++ = OP_VREVERSE;
8685
8.24k
      PUT2INC(code, 0, lookbehindminlength);
8686
8.24k
      PUT2INC(code, 0, lookbehindlength);
8687
8.24k
      length += 1 + 2*IMM2_SIZE;
8688
8.24k
      }
8689
20.4k
    }
8690
8691
  /* Now compile the branch; in the pre-compile phase its length gets added
8692
  into the length. */
8693
8694
662k
  if ((branch_return =
8695
662k
        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8696
662k
          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8697
662k
          &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8698
2.38k
    return 0;
8699
8700
  /* If a branch can match an empty string, so can the whole group. */
8701
8702
659k
  if (branch_return < 0) okreturn = -1;
8703
8704
  /* In the real compile phase, there is some post-processing to be done. */
8705
8706
659k
  if (lengthptr == NULL)
8707
325k
    {
8708
    /* If this is the first branch, the firstcu and reqcu values for the
8709
    branch become the values for the regex. */
8710
8711
325k
    if (*last_branch != OP_ALT)
8712
234k
      {
8713
234k
      firstcu = branchfirstcu;
8714
234k
      firstcuflags = branchfirstcuflags;
8715
234k
      reqcu = branchreqcu;
8716
234k
      reqcuflags = branchreqcuflags;
8717
234k
      }
8718
8719
    /* If this is not the first branch, the first char and reqcu have to
8720
    match the values from all the previous branches, except that if the
8721
    previous value for reqcu didn't have REQ_VARY set, it can still match,
8722
    and we set REQ_VARY for the group from this branch's value. */
8723
8724
91.0k
    else
8725
91.0k
      {
8726
      /* If we previously had a firstcu, but it doesn't match the new branch,
8727
      we have to abandon the firstcu for the regex, but if there was
8728
      previously no reqcu, it takes on the value of the old firstcu. */
8729
8730
91.0k
      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8731
63.8k
        {
8732
63.8k
        if (firstcuflags < REQ_NONE)
8733
15.3k
          {
8734
15.3k
          if (reqcuflags >= REQ_NONE)
8735
3.73k
            {
8736
3.73k
            reqcu = firstcu;
8737
3.73k
            reqcuflags = firstcuflags;
8738
3.73k
            }
8739
15.3k
          }
8740
63.8k
        firstcuflags = REQ_NONE;
8741
63.8k
        }
8742
8743
      /* If we (now or from before) have no firstcu, a firstcu from the
8744
      branch becomes a reqcu if there isn't a branch reqcu. */
8745
8746
91.0k
      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8747
30.7k
          branchreqcuflags >= REQ_NONE)
8748
9.70k
        {
8749
9.70k
        branchreqcu = branchfirstcu;
8750
9.70k
        branchreqcuflags = branchfirstcuflags;
8751
9.70k
        }
8752
8753
      /* Now ensure that the reqcus match */
8754
8755
91.0k
      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8756
35.4k
          reqcu != branchreqcu)
8757
69.6k
        reqcuflags = REQ_NONE;
8758
21.4k
      else
8759
21.4k
        {
8760
21.4k
        reqcu = branchreqcu;
8761
21.4k
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8762
21.4k
        }
8763
91.0k
      }
8764
325k
    }
8765
8766
  /* Handle reaching the end of the expression, either ')' or end of pattern.
8767
  In the real compile phase, go back through the alternative branches and
8768
  reverse the chain of offsets, with the field in the BRA item now becoming an
8769
  offset to the first alternative. If there are no alternatives, it points to
8770
  the end of the group. The length in the terminating ket is always the length
8771
  of the whole bracketed item. Return leaving the pointer at the terminating
8772
  char. */
8773
8774
659k
  if (META_CODE(*pptr) != META_ALT)
8775
473k
    {
8776
473k
    if (lengthptr == NULL)
8777
234k
      {
8778
234k
      uint32_t branch_length = (uint32_t)(code - last_branch);
8779
234k
      do
8780
325k
        {
8781
325k
        uint32_t prev_length = GET(last_branch, 1);
8782
325k
        PUT(last_branch, 1, branch_length);
8783
325k
        branch_length = prev_length;
8784
325k
        last_branch -= branch_length;
8785
325k
        }
8786
325k
      while (branch_length > 0);
8787
234k
      }
8788
8789
    /* Fill in the ket */
8790
8791
473k
    *code = OP_KET;
8792
473k
    PUT(code, 1, (uint32_t)(code - start_bracket));
8793
473k
    code += 1 + LINK_SIZE;
8794
8795
    /* Set values to pass back */
8796
8797
473k
    *codeptr = code;
8798
473k
    *pptrptr = pptr;
8799
473k
    *firstcuptr = firstcu;
8800
473k
    *firstcuflagsptr = firstcuflags;
8801
473k
    *reqcuptr = reqcu;
8802
473k
    *reqcuflagsptr = reqcuflags;
8803
473k
    if (lengthptr != NULL)
8804
238k
      {
8805
238k
      if (OFLOW_MAX - *lengthptr < length)
8806
0
        {
8807
0
        *errorcodeptr = ERR20;
8808
0
        return 0;
8809
0
        }
8810
238k
      *lengthptr += length;
8811
238k
      }
8812
473k
    return okreturn;
8813
473k
    }
8814
8815
  /* Another branch follows. In the pre-compile phase, we can move the code
8816
  pointer back to where it was for the start of the first branch. (That is,
8817
  pretend that each branch is the only one.)
8818
8819
  In the real compile phase, insert an ALT node. Its length field points back
8820
  to the previous branch while the bracket remains open. At the end the chain
8821
  is reversed. It's done like this so that the start of the bracket has a
8822
  zero offset until it is closed, making it possible to detect recursion. */
8823
8824
186k
  if (lengthptr != NULL)
8825
95.6k
    {
8826
95.6k
    code = *codeptr + 1 + LINK_SIZE + skipunits;
8827
95.6k
    length += 1 + LINK_SIZE;
8828
95.6k
    }
8829
91.1k
  else
8830
91.1k
    {
8831
91.1k
    *code = OP_ALT;
8832
91.1k
    PUT(code, 1, (int)(code - last_branch));
8833
91.1k
    bc.current_branch = last_branch = code;
8834
91.1k
    code += 1 + LINK_SIZE;
8835
91.1k
    }
8836
8837
  /* Set the maximum lookbehind length for the next branch (if not in a
8838
  lookbehind the value will be zero) and then advance past the vertical bar. */
8839
8840
186k
  lookbehindlength = META_DATA(*pptr);
8841
186k
  pptr++;
8842
186k
  }
8843
8844
/* LCOV_EXCL_START */
8845
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8846
0
return 0;                  /* Avoid compiler warnings */
8847
/* LCOV_EXCL_STOP */
8848
475k
}
8849
8850
8851
8852
/*************************************************
8853
*          Check for anchored pattern            *
8854
*************************************************/
8855
8856
/* Try to find out if this is an anchored regular expression. Consider each
8857
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8858
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8859
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8860
be found, because ^ generates OP_CIRCM in that mode.
8861
8862
We can also consider a regex to be anchored if OP_SOM starts all its branches.
8863
This is the code for \G, which means "match at start of match position, taking
8864
into account the match offset".
8865
8866
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8867
because that will try the rest of the pattern at all possible matching points,
8868
so there is no point trying again.... er ....
8869
8870
.... except when the .* appears inside capturing parentheses, and there is a
8871
subsequent back reference to those parentheses. We haven't enough information
8872
to catch that case precisely.
8873
8874
At first, the best we could do was to detect when .* was in capturing brackets
8875
and the highest back reference was greater than or equal to that level.
8876
However, by keeping a bitmap of the first 31 back references, we can catch some
8877
of the more common cases more precisely.
8878
8879
... A second exception is when the .* appears inside an atomic group, because
8880
this prevents the number of characters it matches from being adjusted.
8881
8882
Arguments:
8883
  code           points to start of the compiled pattern
8884
  bracket_map    a bitmap of which brackets we are inside while testing; this
8885
                   handles up to substring 31; after that we just have to take
8886
                   the less precise approach
8887
  cb             points to the compile data block
8888
  atomcount      atomic group level
8889
  inassert       TRUE if in an assertion
8890
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8891
8892
Returns:     TRUE or FALSE
8893
*/
8894
8895
static BOOL
8896
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8897
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8898
82.1k
{
8899
85.3k
do {
8900
85.3k
   PCRE2_SPTR scode = first_significant_code(
8901
85.3k
     code + PRIV(OP_lengths)[*code], FALSE);
8902
85.3k
   int op = *scode;
8903
8904
   /* Non-capturing brackets */
8905
8906
85.3k
   if (op == OP_BRA  || op == OP_BRAPOS ||
8907
83.2k
       op == OP_SBRA || op == OP_SBRAPOS)
8908
2.33k
     {
8909
2.33k
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8910
1.95k
       return FALSE;
8911
2.33k
     }
8912
8913
   /* Capturing brackets */
8914
8915
83.0k
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8916
76.4k
            op == OP_SCBRA || op == OP_SCBRAPOS)
8917
7.08k
     {
8918
7.08k
     int n = GET2(scode, 1+LINK_SIZE);
8919
7.08k
     uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);
8920
7.08k
     if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;
8921
7.08k
     }
8922
8923
   /* Positive forward assertion */
8924
8925
75.9k
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8926
3.44k
     {
8927
3.44k
     if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;
8928
3.44k
     }
8929
8930
   /* Condition. If there is no second branch, it can't be anchored. */
8931
8932
72.4k
   else if (op == OP_COND || op == OP_SCOND)
8933
1.89k
     {
8934
1.89k
     if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8935
148
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8936
129
       return FALSE;
8937
148
     }
8938
8939
   /* Atomic groups */
8940
8941
70.5k
   else if (op == OP_ONCE)
8942
976
     {
8943
976
     if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
8944
885
       return FALSE;
8945
976
     }
8946
8947
   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8948
   it isn't in brackets that are or may be referenced or inside an atomic
8949
   group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8950
   because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8951
   with the subject "aab", which matches "b", i.e. not at the start of a line.
8952
   There is also an option that disables auto-anchoring. */
8953
8954
69.6k
   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8955
65.2k
             op == OP_TYPEPOSSTAR))
8956
6.13k
     {
8957
6.13k
     if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8958
1.19k
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8959
4.98k
       return FALSE;
8960
6.13k
     }
8961
8962
   /* Check for explicit anchoring */
8963
8964
63.4k
   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8965
8966
5.47k
   code += GET(code, 1);
8967
5.47k
   }
8968
82.1k
while (*code == OP_ALT);   /* Loop for each alternative */
8969
2.30k
return TRUE;
8970
82.1k
}
8971
8972
8973
8974
/*************************************************
8975
*         Check for starting with ^ or .*        *
8976
*************************************************/
8977
8978
/* This is called to find out if every branch starts with ^ or .* so that
8979
"first char" processing can be done to speed things up in multiline
8980
matching and for non-DOTALL patterns that start with .* (which must start at
8981
the beginning or after \n). As in the case of is_anchored() (see above), we
8982
have to take account of back references to capturing brackets that contain .*
8983
because in that case we can't make the assumption. Also, the appearance of .*
8984
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8985
or *SKIP does not count, because once again the assumption no longer holds.
8986
8987
Arguments:
8988
  code           points to start of the compiled pattern or a group
8989
  bracket_map    a bitmap of which brackets we are inside while testing; this
8990
                   handles up to substring 31; after that we just have to take
8991
                   the less precise approach
8992
  cb             points to the compile data
8993
  atomcount      atomic group level
8994
  inassert       TRUE if in an assertion
8995
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8996
8997
Returns:         TRUE or FALSE
8998
*/
8999
9000
static BOOL
9001
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
9002
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
9003
45.8k
{
9004
49.8k
do {
9005
49.8k
   PCRE2_SPTR scode = first_significant_code(
9006
49.8k
     code + PRIV(OP_lengths)[*code], FALSE);
9007
49.8k
   int op = *scode;
9008
9009
   /* If we are at the start of a conditional assertion group, *both* the
9010
   conditional assertion *and* what follows the condition must satisfy the test
9011
   for start of line. Other kinds of condition fail. Note that there may be an
9012
   auto-callout at the start of a condition. */
9013
9014
49.8k
   if (op == OP_COND)
9015
1.54k
     {
9016
1.54k
     scode += 1 + LINK_SIZE;
9017
9018
1.54k
     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
9019
977
       else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);
9020
9021
1.54k
     switch (*scode)
9022
1.54k
       {
9023
16
       case OP_CREF:
9024
27
       case OP_DNCREF:
9025
82
       case OP_RREF:
9026
82
       case OP_DNRREF:
9027
283
       case OP_FAIL:
9028
283
       case OP_FALSE:
9029
283
       case OP_TRUE:
9030
283
       return FALSE;
9031
9032
1.26k
       default:     /* Assertion */
9033
1.26k
       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
9034
1.20k
         return FALSE;
9035
131
       do scode += GET(scode, 1); while (*scode == OP_ALT);
9036
58
       scode += 1 + LINK_SIZE;
9037
58
       break;
9038
1.54k
       }
9039
58
     scode = first_significant_code(scode, FALSE);
9040
58
     op = *scode;
9041
58
     }
9042
9043
   /* Non-capturing brackets */
9044
9045
48.4k
   if (op == OP_BRA  || op == OP_BRAPOS ||
9046
47.0k
       op == OP_SBRA || op == OP_SBRAPOS)
9047
1.49k
     {
9048
1.49k
     if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
9049
1.24k
       return FALSE;
9050
1.49k
     }
9051
9052
   /* Capturing brackets */
9053
9054
46.9k
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
9055
42.7k
            op == OP_SCBRA || op == OP_SCBRAPOS)
9056
4.42k
     {
9057
4.42k
     int n = GET2(scode, 1+LINK_SIZE);
9058
4.42k
     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);
9059
4.42k
     if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))
9060
3.71k
       return FALSE;
9061
4.42k
     }
9062
9063
   /* Positive forward assertions */
9064
9065
42.4k
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
9066
1.11k
     {
9067
1.11k
     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
9068
1.01k
       return FALSE;
9069
1.11k
     }
9070
9071
   /* Atomic brackets */
9072
9073
41.3k
   else if (op == OP_ONCE)
9074
655
     {
9075
655
     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
9076
585
       return FALSE;
9077
655
     }
9078
9079
   /* .* means "start at start or after \n" if it isn't in atomic brackets or
9080
   brackets that may be referenced or an assertion, and as long as the pattern
9081
   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
9082
   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
9083
   i.e. not at the start of a line. There is also an option that disables this
9084
   optimization. */
9085
9086
40.7k
   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
9087
6.04k
     {
9088
6.04k
     if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
9089
2.71k
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
9090
3.37k
       return FALSE;
9091
6.04k
     }
9092
9093
   /* Check for explicit circumflex; anything else gives a FALSE result. Note
9094
   in particular that this includes atomic brackets OP_ONCE because the number
9095
   of characters matched by .* cannot be adjusted inside them. */
9096
9097
34.6k
   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
9098
9099
   /* Move on to the next alternative */
9100
9101
5.72k
   code += GET(code, 1);
9102
5.72k
   }
9103
45.8k
while (*code == OP_ALT);  /* Loop for each alternative */
9104
1.71k
return TRUE;
9105
45.8k
}
9106
9107
9108
9109
/*************************************************
9110
*   Scan compiled regex for recursion reference  *
9111
*************************************************/
9112
9113
/* This function scans through a compiled pattern until it finds an instance of
9114
OP_RECURSE.
9115
9116
Arguments:
9117
  code        points to start of expression
9118
  utf         TRUE in UTF mode
9119
9120
Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
9121
*/
9122
9123
static PCRE2_UCHAR *
9124
find_recurse(PCRE2_UCHAR *code, BOOL utf)
9125
561k
{
9126
561k
for (;;)
9127
20.4M
  {
9128
20.4M
  PCRE2_UCHAR c = *code;
9129
20.4M
  if (c == OP_END) return NULL;
9130
20.4M
  if (c == OP_RECURSE) return code;
9131
9132
  /* XCLASS is used for classes that cannot be represented just by a bit map.
9133
  This includes negated single high-valued characters. ECLASS is used for
9134
  classes that use set operations internally. CALLOUT_STR is used for
9135
  callouts with string arguments. In each case the length in the table is
9136
  zero; the actual length is stored in the compiled code. */
9137
9138
19.8M
  if (c == OP_XCLASS || c == OP_ECLASS) code += GET(code, 1);
9139
19.6M
  else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
9140
9141
  /* Otherwise, we can get the item's length from the table, except that for
9142
  repeated character types, we have to test for \p and \P, which have an extra
9143
  two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
9144
  we must add in its length. */
9145
9146
19.6M
  else
9147
19.6M
    {
9148
19.6M
    switch(c)
9149
19.6M
      {
9150
29.3k
      case OP_TYPESTAR:
9151
38.6k
      case OP_TYPEMINSTAR:
9152
95.1k
      case OP_TYPEPLUS:
9153
115k
      case OP_TYPEMINPLUS:
9154
134k
      case OP_TYPEQUERY:
9155
140k
      case OP_TYPEMINQUERY:
9156
141k
      case OP_TYPEPOSSTAR:
9157
142k
      case OP_TYPEPOSPLUS:
9158
143k
      case OP_TYPEPOSQUERY:
9159
143k
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
9160
143k
      break;
9161
9162
1.42k
      case OP_TYPEPOSUPTO:
9163
29.4k
      case OP_TYPEUPTO:
9164
40.3k
      case OP_TYPEMINUPTO:
9165
44.2k
      case OP_TYPEEXACT:
9166
44.2k
      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
9167
3.54k
        code += 2;
9168
44.2k
      break;
9169
9170
6.77k
      case OP_MARK:
9171
14.4k
      case OP_COMMIT_ARG:
9172
16.0k
      case OP_PRUNE_ARG:
9173
18.0k
      case OP_SKIP_ARG:
9174
23.2k
      case OP_THEN_ARG:
9175
23.2k
      code += code[1];
9176
23.2k
      break;
9177
19.6M
      }
9178
9179
    /* Add in the fixed length from the table */
9180
9181
19.6M
    code += PRIV(OP_lengths)[c];
9182
9183
    /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
9184
    be followed by a multi-unit character. The length in the table is a
9185
    minimum, so we have to arrange to skip the extra units. */
9186
9187
19.6M
#ifdef MAYBE_UTF_MULTI
9188
19.6M
    if (utf) switch(c)
9189
4.29M
      {
9190
845k
      case OP_CHAR:
9191
2.28M
      case OP_CHARI:
9192
2.28M
      case OP_NOT:
9193
2.28M
      case OP_NOTI:
9194
2.28M
      case OP_EXACT:
9195
2.29M
      case OP_EXACTI:
9196
2.29M
      case OP_NOTEXACT:
9197
2.29M
      case OP_NOTEXACTI:
9198
2.29M
      case OP_UPTO:
9199
2.29M
      case OP_UPTOI:
9200
2.29M
      case OP_NOTUPTO:
9201
2.29M
      case OP_NOTUPTOI:
9202
2.30M
      case OP_MINUPTO:
9203
2.30M
      case OP_MINUPTOI:
9204
2.30M
      case OP_NOTMINUPTO:
9205
2.30M
      case OP_NOTMINUPTOI:
9206
2.30M
      case OP_POSUPTO:
9207
2.30M
      case OP_POSUPTOI:
9208
2.30M
      case OP_NOTPOSUPTO:
9209
2.30M
      case OP_NOTPOSUPTOI:
9210
2.30M
      case OP_STAR:
9211
2.31M
      case OP_STARI:
9212
2.31M
      case OP_NOTSTAR:
9213
2.31M
      case OP_NOTSTARI:
9214
2.32M
      case OP_MINSTAR:
9215
2.33M
      case OP_MINSTARI:
9216
2.33M
      case OP_NOTMINSTAR:
9217
2.34M
      case OP_NOTMINSTARI:
9218
2.34M
      case OP_POSSTAR:
9219
2.34M
      case OP_POSSTARI:
9220
2.34M
      case OP_NOTPOSSTAR:
9221
2.34M
      case OP_NOTPOSSTARI:
9222
2.34M
      case OP_PLUS:
9223
2.35M
      case OP_PLUSI:
9224
2.35M
      case OP_NOTPLUS:
9225
2.35M
      case OP_NOTPLUSI:
9226
2.36M
      case OP_MINPLUS:
9227
2.37M
      case OP_MINPLUSI:
9228
2.37M
      case OP_NOTMINPLUS:
9229
2.37M
      case OP_NOTMINPLUSI:
9230
2.37M
      case OP_POSPLUS:
9231
2.37M
      case OP_POSPLUSI:
9232
2.37M
      case OP_NOTPOSPLUS:
9233
2.37M
      case OP_NOTPOSPLUSI:
9234
2.38M
      case OP_QUERY:
9235
2.38M
      case OP_QUERYI:
9236
2.39M
      case OP_NOTQUERY:
9237
2.39M
      case OP_NOTQUERYI:
9238
2.40M
      case OP_MINQUERY:
9239
2.42M
      case OP_MINQUERYI:
9240
2.42M
      case OP_NOTMINQUERY:
9241
2.42M
      case OP_NOTMINQUERYI:
9242
2.42M
      case OP_POSQUERY:
9243
2.42M
      case OP_POSQUERYI:
9244
2.42M
      case OP_NOTPOSQUERY:
9245
2.42M
      case OP_NOTPOSQUERYI:
9246
2.42M
      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9247
2.42M
      break;
9248
4.29M
      }
9249
#else
9250
    (void)(utf);  /* Keep compiler happy by referencing function argument */
9251
#endif  /* MAYBE_UTF_MULTI */
9252
19.6M
    }
9253
19.8M
  }
9254
561k
}
9255
9256
9257
9258
/*************************************************
9259
*    Check for asserted fixed first code unit    *
9260
*************************************************/
9261
9262
/* During compilation, the "first code unit" settings from forward assertions
9263
are discarded, because they can cause conflicts with actual literals that
9264
follow. However, if we end up without a first code unit setting for an
9265
unanchored pattern, it is worth scanning the regex to see if there is an
9266
initial asserted first code unit. If all branches start with the same asserted
9267
code unit, or with a non-conditional bracket all of whose alternatives start
9268
with the same asserted code unit (recurse ad lib), then we return that code
9269
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9270
REQ_NONE in the flags.
9271
9272
Arguments:
9273
  code       points to start of compiled pattern
9274
  flags      points to the first code unit flags
9275
  inassert   non-zero if in an assertion
9276
9277
Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9278
*/
9279
9280
static uint32_t
9281
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9282
49.4k
{
9283
49.4k
uint32_t c = 0;
9284
49.4k
uint32_t cflags = REQ_NONE;
9285
9286
49.4k
*flags = REQ_NONE;
9287
53.8k
do {
9288
53.8k
   uint32_t d;
9289
53.8k
   uint32_t dflags;
9290
53.8k
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9291
50.0k
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9292
53.8k
   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9293
53.8k
   PCRE2_UCHAR op = *scode;
9294
9295
53.8k
   switch(op)
9296
53.8k
     {
9297
33.7k
     default:
9298
33.7k
     return 0;
9299
9300
1.18k
     case OP_BRA:
9301
1.28k
     case OP_BRAPOS:
9302
4.90k
     case OP_CBRA:
9303
5.01k
     case OP_SCBRA:
9304
5.15k
     case OP_CBRAPOS:
9305
5.28k
     case OP_SCBRAPOS:
9306
6.87k
     case OP_ASSERT:
9307
8.63k
     case OP_ASSERT_NA:
9308
9.31k
     case OP_ONCE:
9309
9.80k
     case OP_SCRIPT_RUN:
9310
9.80k
     d = find_firstassertedcu(scode, &dflags, inassert +
9311
9.80k
       ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9312
9.80k
     if (dflags >= REQ_NONE) return 0;
9313
2.54k
     if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9314
1.04k
       else if (c != d || cflags != dflags) return 0;
9315
2.51k
     break;
9316
9317
2.51k
     case OP_EXACT:
9318
16
     scode += IMM2_SIZE;
9319
16
     PCRE2_FALLTHROUGH /* Fall through */
9320
9321
5.70k
     case OP_CHAR:
9322
6.08k
     case OP_PLUS:
9323
6.42k
     case OP_MINPLUS:
9324
7.35k
     case OP_POSPLUS:
9325
7.35k
     if (inassert == 0) return 0;
9326
4.08k
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9327
1.95k
       else if (c != scode[1]) return 0;
9328
4.01k
     break;
9329
9330
4.01k
     case OP_EXACTI:
9331
12
     scode += IMM2_SIZE;
9332
12
     PCRE2_FALLTHROUGH /* Fall through */
9333
9334
2.31k
     case OP_CHARI:
9335
2.59k
     case OP_PLUSI:
9336
2.80k
     case OP_MINPLUSI:
9337
2.89k
     case OP_POSPLUSI:
9338
2.89k
     if (inassert == 0) return 0;
9339
9340
     /* If the character is more than one code unit long, we cannot set its
9341
     first code unit when matching caselessly. Later scanning may pick up
9342
     multiple code units. */
9343
9344
1.66k
#ifdef SUPPORT_UNICODE
9345
#if PCRE2_CODE_UNIT_WIDTH == 8
9346
     if (scode[1] >= 0x80) return 0;
9347
#elif PCRE2_CODE_UNIT_WIDTH == 16
9348
1.66k
     if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9349
1.65k
#endif
9350
1.65k
#endif
9351
9352
1.65k
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9353
1.26k
       else if (c != scode[1]) return 0;
9354
1.62k
     break;
9355
53.8k
     }
9356
9357
8.15k
   code += GET(code, 1);
9358
8.15k
   }
9359
49.4k
while (*code == OP_ALT);
9360
9361
3.77k
*flags = cflags;
9362
3.77k
return c;
9363
49.4k
}
9364
9365
9366
9367
/*************************************************
9368
*             Skip in parsed pattern             *
9369
*************************************************/
9370
9371
/* This function is called to skip parts of the parsed pattern when finding the
9372
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9373
the end of the branch, it is called to skip over an internal lookaround or
9374
(DEFINE) group, and it is also called to skip to the end of a class, during
9375
which it will never encounter nested groups (but there's no need to have
9376
special code for that).
9377
9378
When called to find the end of a branch or group, pptr must point to the first
9379
meta code inside the branch, not the branch-starting code. In other cases it
9380
can point to the item that causes the function to be called.
9381
9382
Arguments:
9383
  pptr       current pointer to skip from
9384
  skiptype   PSKIP_CLASS when skipping to end of class
9385
             PSKIP_ALT when META_ALT ends the skip
9386
             PSKIP_KET when only META_KET ends the skip
9387
9388
Returns:     new value of pptr
9389
             NULL if META_END is reached - should never occur
9390
               or for an unknown meta value - likewise
9391
*/
9392
9393
static uint32_t *
9394
parsed_skip(uint32_t *pptr, uint32_t skiptype)
9395
5.00k
{
9396
5.00k
uint32_t nestlevel = 0;
9397
9398
420k
for (;; pptr++)
9399
425k
  {
9400
425k
  uint32_t meta = META_CODE(*pptr);
9401
9402
425k
  switch(meta)
9403
425k
    {
9404
391k
    default:  /* Just skip over most items */
9405
391k
    if (meta < META_END) continue;  /* Literal */
9406
40.1k
    break;
9407
9408
    /* The parsed regex is malformed; we have reached the end and did
9409
    not find the end of the construct which we are skipping over. */
9410
9411
    /* LCOV_EXCL_START */
9412
40.1k
    case META_END:
9413
0
    PCRE2_DEBUG_UNREACHABLE();
9414
0
    return NULL;
9415
    /* LCOV_EXCL_STOP */
9416
9417
    /* The data for these items is variable in length. */
9418
9419
1.62k
    case META_BACKREF:  /* Offset is present only if group >= 10 */
9420
1.62k
    if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9421
1.62k
    break;
9422
9423
6.56k
    case META_ESCAPE:
9424
6.56k
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9425
1.37k
      pptr += 1;     /* Skip prop data */
9426
6.56k
    break;
9427
9428
454
    case META_MARK:     /* Add the length of the name. */
9429
888
    case META_COMMIT_ARG:
9430
1.09k
    case META_PRUNE_ARG:
9431
1.49k
    case META_SKIP_ARG:
9432
1.55k
    case META_THEN_ARG:
9433
1.55k
    pptr += pptr[1];
9434
1.55k
    break;
9435
9436
    /* These are the "active" items in this loop. */
9437
9438
5.05k
    case META_CLASS_END:
9439
5.05k
    if (skiptype == PSKIP_CLASS) return pptr;
9440
2.57k
    break;
9441
9442
2.57k
    case META_ATOMIC:
9443
3.18k
    case META_CAPTURE:
9444
3.44k
    case META_COND_ASSERT:
9445
3.44k
    case META_COND_DEFINE:
9446
3.73k
    case META_COND_NAME:
9447
3.80k
    case META_COND_NUMBER:
9448
3.87k
    case META_COND_RNAME:
9449
4.08k
    case META_COND_RNUMBER:
9450
4.08k
    case META_COND_VERSION:
9451
4.14k
    case META_SCS:
9452
4.41k
    case META_LOOKAHEAD:
9453
4.81k
    case META_LOOKAHEADNOT:
9454
5.03k
    case META_LOOKAHEAD_NA:
9455
5.30k
    case META_LOOKBEHIND:
9456
5.71k
    case META_LOOKBEHINDNOT:
9457
5.96k
    case META_LOOKBEHIND_NA:
9458
6.23k
    case META_NOCAPTURE:
9459
6.30k
    case META_SCRIPT_RUN:
9460
6.30k
    nestlevel++;
9461
6.30k
    break;
9462
9463
4.63k
    case META_ALT:
9464
4.63k
    if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9465
4.44k
    break;
9466
9467
8.63k
    case META_KET:
9468
8.63k
    if (nestlevel == 0) return pptr;
9469
6.30k
    nestlevel--;
9470
6.30k
    break;
9471
425k
    }
9472
9473
  /* The extra data item length for each meta is in a table. */
9474
9475
69.5k
  meta = (meta >> 16) & 0x7fff;
9476
69.5k
  if (meta >= sizeof(meta_extra_lengths)) return NULL;
9477
69.5k
  pptr += meta_extra_lengths[meta];
9478
69.5k
  }
9479
9480
/* LCOV_EXCL_START */
9481
5.00k
PCRE2_UNREACHABLE(); /* Control never reaches here */
9482
/* LCOV_EXCL_STOP */
9483
5.00k
}
9484
9485
9486
9487
/*************************************************
9488
*       Find length of a parsed group            *
9489
*************************************************/
9490
9491
/* This is called for nested groups within a branch of a lookbehind whose
9492
length is being computed. On entry, the pointer must be at the first element
9493
after the group initializing code. On exit it points to OP_KET. Caching is used
9494
to improve processing speed when the same capturing group occurs many times.
9495
9496
Arguments:
9497
  pptrptr     pointer to pointer in the parsed pattern
9498
  minptr      where to return the minimum length
9499
  isinline    FALSE if a reference or recursion; TRUE for inline group
9500
  errcodeptr  pointer to the errorcode
9501
  lcptr       pointer to the loop counter
9502
  group       number of captured group or -1 for a non-capturing group
9503
  recurses    chain of recurse_check to catch mutual recursion
9504
  cb          pointer to the compile data
9505
9506
Returns:      the maximum group length or a negative number
9507
*/
9508
9509
static int
9510
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9511
  int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9512
6.07k
{
9513
6.07k
uint32_t *gi = cb->groupinfo + 2 * group;
9514
6.07k
int branchlength, branchminlength;
9515
6.07k
int grouplength = -1;
9516
6.07k
int groupminlength = INT_MAX;
9517
9518
/* The cache can be used only if there is no possibility of there being two
9519
groups with the same number. We do not need to set the end pointer for a group
9520
that is being processed as a back reference or recursion, but we must do so for
9521
an inline group. */
9522
9523
6.07k
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9524
3.42k
  {
9525
3.42k
  uint32_t groupinfo = gi[0];
9526
3.42k
  if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9527
3.42k
  if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9528
1.47k
    {
9529
1.47k
    if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9530
1.47k
    *minptr = gi[1];
9531
1.47k
    return groupinfo & GI_FIXED_LENGTH_MASK;
9532
1.47k
    }
9533
3.42k
  }
9534
9535
/* Scan the group. In this case we find the end pointer of necessity. */
9536
9537
4.60k
for(;;)
9538
6.60k
  {
9539
6.60k
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9540
6.60k
    recurses, cb);
9541
6.60k
  if (branchlength < 0) goto ISNOTFIXED;
9542
6.05k
  if (branchlength > grouplength) grouplength = branchlength;
9543
6.05k
  if (branchminlength < groupminlength) groupminlength = branchminlength;
9544
6.05k
  if (**pptrptr == META_KET) break;
9545
2.00k
  *pptrptr += 1;   /* Skip META_ALT */
9546
2.00k
  }
9547
9548
4.05k
if (group > 0)
9549
1.98k
  {
9550
1.98k
  gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9551
1.98k
  gi[1] = groupminlength;
9552
1.98k
  }
9553
9554
4.05k
*minptr = groupminlength;
9555
4.05k
return grouplength;
9556
9557
551
ISNOTFIXED:
9558
551
if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9559
551
return -1;
9560
4.60k
}
9561
9562
9563
9564
/*************************************************
9565
*        Find length of a parsed branch          *
9566
*************************************************/
9567
9568
/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9569
giving an error if the length is not limited. On entry, *pptrptr points to the
9570
first element inside the branch. On exit it is set to point to the ALT or KET.
9571
9572
Arguments:
9573
  pptrptr     pointer to pointer in the parsed pattern
9574
  minptr      where to return the minimum length
9575
  errcodeptr  pointer to error code
9576
  lcptr       pointer to loop counter
9577
  recurses    chain of recurse_check to catch mutual recursion
9578
  cb          pointer to compile block
9579
9580
Returns:      the maximum length, or a negative value on error
9581
*/
9582
9583
static int
9584
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9585
  parsed_recurse_check *recurses, compile_block *cb)
9586
28.5k
{
9587
28.5k
int branchlength = 0;
9588
28.5k
int branchminlength = 0;
9589
28.5k
int grouplength, groupminlength;
9590
28.5k
uint32_t lastitemlength = 0;
9591
28.5k
uint32_t lastitemminlength = 0;
9592
28.5k
uint32_t *pptr = *pptrptr;
9593
28.5k
PCRE2_SIZE offset;
9594
28.5k
parsed_recurse_check this_recurse;
9595
9596
/* A large and/or complex regex can take too long to process. This can happen
9597
more often when (?| groups are present in the pattern because their length
9598
cannot be cached. */
9599
9600
28.5k
if ((*lcptr)++ > 2000)
9601
3
  {
9602
3
  *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9603
3
  return -1;
9604
3
  }
9605
9606
/* Scan the branch, accumulating the length. */
9607
9608
352k
for (;; pptr++)
9609
381k
  {
9610
381k
  parsed_recurse_check *r;
9611
381k
  uint32_t *gptr, *gptrend;
9612
381k
  uint32_t escape;
9613
381k
  uint32_t min, max;
9614
381k
  uint32_t group = 0;
9615
381k
  uint32_t itemlength = 0;
9616
381k
  uint32_t itemminlength = 0;
9617
9618
381k
  if (*pptr < META_END)
9619
272k
    {
9620
272k
    itemlength = itemminlength = 1;
9621
272k
    }
9622
9623
109k
  else switch (META_CODE(*pptr))
9624
109k
    {
9625
16.0k
    case META_KET:
9626
26.8k
    case META_ALT:
9627
26.8k
    goto EXIT;
9628
9629
    /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9630
    actual termination. */
9631
9632
343
    case META_ACCEPT:
9633
557
    case META_FAIL:
9634
557
    pptr = parsed_skip(pptr, PSKIP_ALT);
9635
557
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9636
557
    goto EXIT;
9637
9638
557
    case META_MARK:
9639
467
    case META_COMMIT_ARG:
9640
689
    case META_PRUNE_ARG:
9641
1.22k
    case META_SKIP_ARG:
9642
1.32k
    case META_THEN_ARG:
9643
1.32k
    pptr += pptr[1] + 1;
9644
1.32k
    break;
9645
9646
1.12k
    case META_CIRCUMFLEX:
9647
1.38k
    case META_COMMIT:
9648
1.82k
    case META_DOLLAR:
9649
1.94k
    case META_PRUNE:
9650
2.21k
    case META_SKIP:
9651
2.73k
    case META_THEN:
9652
2.73k
    break;
9653
9654
74
    case META_OPTIONS:
9655
74
    pptr += 2;
9656
74
    break;
9657
9658
0
    case META_BIGVALUE:
9659
0
    itemlength = itemminlength = 1;
9660
0
    pptr += 1;
9661
0
    break;
9662
9663
1.55k
    case META_CLASS:
9664
2.48k
    case META_CLASS_NOT:
9665
2.48k
    itemlength = itemminlength = 1;
9666
2.48k
    pptr = parsed_skip(pptr, PSKIP_CLASS);
9667
2.48k
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9668
2.48k
    break;
9669
9670
2.48k
    case META_CLASS_EMPTY_NOT:
9671
632
    case META_DOT:
9672
632
    itemlength = itemminlength = 1;
9673
632
    break;
9674
9675
48.6k
    case META_CALLOUT_NUMBER:
9676
48.6k
    pptr += 3;
9677
48.6k
    break;
9678
9679
78
    case META_CALLOUT_STRING:
9680
78
    pptr += 3 + SIZEOFFSET;
9681
78
    break;
9682
9683
    /* Only some escapes consume a character. Of those, \R can match one or two
9684
    characters, but \X is never allowed because it matches an unknown number of
9685
    characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9686
9687
6.62k
    case META_ESCAPE:
9688
6.62k
    escape = META_DATA(*pptr);
9689
6.62k
    if (escape == ESC_X) return -1;
9690
6.61k
    if (escape == ESC_R)
9691
261
      {
9692
261
      itemminlength = 1;
9693
261
      itemlength = 2;
9694
261
      }
9695
6.35k
    else if (escape > ESC_b && escape < ESC_Z)
9696
4.69k
      {
9697
4.69k
#if PCRE2_CODE_UNIT_WIDTH != 32
9698
4.69k
      if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9699
0
        {
9700
0
        *errcodeptr = ERR36;
9701
0
        return -1;
9702
0
        }
9703
4.69k
#endif
9704
4.69k
      itemlength = itemminlength = 1;
9705
4.69k
      if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9706
4.69k
      }
9707
6.61k
    break;
9708
9709
    /* Lookaheads do not contribute to the length of this branch, but they may
9710
    contain lookbehinds within them whose lengths need to be set. */
9711
9712
6.61k
    case META_LOOKAHEAD:
9713
2.31k
    case META_LOOKAHEADNOT:
9714
3.02k
    case META_LOOKAHEAD_NA:
9715
3.06k
    case META_SCS:
9716
3.06k
    *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9717
3.06k
    if (*errcodeptr != 0) return -1;
9718
9719
    /* Ignore any qualifiers that follow a lookahead assertion. */
9720
9721
2.93k
    switch (pptr[1])
9722
2.93k
      {
9723
202
      case META_ASTERISK:
9724
298
      case META_ASTERISK_PLUS:
9725
392
      case META_ASTERISK_QUERY:
9726
749
      case META_PLUS:
9727
946
      case META_PLUS_PLUS:
9728
1.03k
      case META_PLUS_QUERY:
9729
1.12k
      case META_QUERY:
9730
1.40k
      case META_QUERY_PLUS:
9731
1.47k
      case META_QUERY_QUERY:
9732
1.47k
      pptr++;
9733
1.47k
      break;
9734
9735
198
      case META_MINMAX:
9736
236
      case META_MINMAX_PLUS:
9737
302
      case META_MINMAX_QUERY:
9738
302
      pptr += 3;
9739
302
      break;
9740
9741
1.16k
      default:
9742
1.16k
      break;
9743
2.93k
      }
9744
2.93k
    break;
9745
9746
    /* A nested lookbehind does not contribute any length to this lookbehind,
9747
    but must itself be checked and have its lengths set. Note that
9748
    set_lookbehind_lengths() updates pptr, leaving it pointing to the final ket
9749
    of the group, so no need to update it here. */
9750
9751
2.93k
    case META_LOOKBEHIND:
9752
684
    case META_LOOKBEHINDNOT:
9753
1.00k
    case META_LOOKBEHIND_NA:
9754
1.00k
    if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9755
98
      return -1;
9756
911
    break;
9757
9758
    /* Back references and recursions are handled by very similar code. At this
9759
    stage, the names generated in the parsing pass are available, but the main
9760
    name table has not yet been created. So for the named varieties, scan the
9761
    list of names in order to get the number of the first one in the pattern,
9762
    and whether or not this name is duplicated. */
9763
9764
911
    case META_BACKREF_BYNAME:
9765
53
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9766
4
      goto ISNOTFIXED;
9767
49
    PCRE2_FALLTHROUGH /* Fall through */
9768
49
9769
161
    case META_RECURSE_BYNAME:
9770
161
      {
9771
161
      PCRE2_SPTR name;
9772
161
      BOOL is_dupname = FALSE;
9773
161
      named_group *ng;
9774
161
      uint32_t meta_code = META_CODE(*pptr);
9775
161
      uint32_t length = *(++pptr);
9776
9777
161
      GETPLUSOFFSET(offset, pptr);
9778
161
      name = cb->start_pattern + offset;
9779
161
      ng = PRIV(compile_find_named_group)(name, length, cb);
9780
9781
161
      if (ng == NULL)
9782
7
        {
9783
7
        *errcodeptr = ERR15;  /* Non-existent subpattern */
9784
7
        cb->erroroffset = offset;
9785
7
        return -1;
9786
7
        }
9787
9788
154
      group = ng->number;
9789
154
      is_dupname = (ng->hash_dup & NAMED_GROUP_IS_DUPNAME) != 0;
9790
9791
      /* A numerical back reference can be fixed length if duplicate capturing
9792
      groups are not being used. A non-duplicate named back reference can also
9793
      be handled. */
9794
9795
154
      if (meta_code == META_RECURSE_BYNAME ||
9796
45
          (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9797
148
        goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9798
154
      }
9799
6
    goto ISNOTFIXED;                     /* Duplicate name or number */
9800
9801
    /* The offset values for back references < 10 are in a separate vector
9802
    because otherwise they would use more than two parsed pattern elements on
9803
    64-bit systems. */
9804
9805
1.41k
    case META_BACKREF:
9806
1.41k
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9807
1.39k
        (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9808
24
      goto ISNOTFIXED;
9809
1.39k
    group = META_DATA(*pptr);
9810
1.39k
    if (group < 10)
9811
1.21k
      {
9812
1.21k
      offset = cb->small_ref_offset[group];
9813
1.21k
      goto RECURSE_OR_BACKREF_LENGTH;
9814
1.21k
      }
9815
9816
174
    PCRE2_FALLTHROUGH /* Fall through */
9817
174
    /* For groups >= 10 - picking up group twice does no harm. */
9818
174
9819
174
    /* A true recursion implies not fixed length, but a subroutine call may
9820
174
    be OK. Back reference "recursions" are also failed. */
9821
174
9822
463
    case META_RECURSE:
9823
463
    group = META_DATA(*pptr);
9824
463
    GETPLUSOFFSET(offset, pptr);
9825
9826
1.82k
    RECURSE_OR_BACKREF_LENGTH:
9827
1.82k
    if (group > cb->bracount)
9828
38
      {
9829
38
      cb->erroroffset = offset;
9830
38
      *errcodeptr = ERR15;  /* Non-existent subpattern */
9831
38
      return -1;
9832
38
      }
9833
1.79k
    if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9834
960k
    for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9835
960k
      {
9836
960k
      if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9837
960k
        else if (*gptr == (META_CAPTURE | group)) break;
9838
960k
      }
9839
9840
    /* We must start the search for the end of the group at the first meta code
9841
    inside the group. Otherwise it will be treated as an enclosed group. */
9842
9843
1.78k
    gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9844
1.78k
    if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9845
1.78k
    if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9846
2.22k
    for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9847
1.69k
    if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9848
1.68k
    this_recurse.prev = recurses;
9849
1.68k
    this_recurse.groupptr = gptr;
9850
9851
    /* We do not need to know the position of the end of the group, that is,
9852
    gptr is not used after the call to get_grouplength(). Setting the second
9853
    argument FALSE stops it scanning for the end when the length can be found
9854
    in the cache. */
9855
9856
1.68k
    gptr++;
9857
1.68k
    grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9858
1.68k
      lcptr, group, &this_recurse, cb);
9859
1.68k
    if (grouplength < 0)
9860
78
      {
9861
78
      if (*errcodeptr == 0) goto ISNOTFIXED;
9862
73
      return -1;  /* Error already set */
9863
78
      }
9864
1.60k
    itemlength = grouplength;
9865
1.60k
    itemminlength = groupminlength;
9866
1.60k
    break;
9867
9868
    /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9869
    the length of this branch. Skip from the following item to the next
9870
    unpaired ket. */
9871
9872
0
    case META_COND_DEFINE:
9873
0
    pptr = parsed_skip(pptr + 1, PSKIP_KET);
9874
0
    break;
9875
9876
    /* Check other nested groups - advance past the initial data for each type
9877
    and then seek a fixed length with get_grouplength(). */
9878
9879
230
    case META_COND_NAME:
9880
302
    case META_COND_NUMBER:
9881
373
    case META_COND_RNAME:
9882
796
    case META_COND_RNUMBER:
9883
796
    pptr += 2 + SIZEOFFSET;
9884
796
    goto CHECK_GROUP;
9885
9886
292
    case META_COND_ASSERT:
9887
292
    pptr += 1;
9888
292
    goto CHECK_GROUP;
9889
9890
0
    case META_COND_VERSION:
9891
0
    pptr += 4;
9892
0
    goto CHECK_GROUP;
9893
9894
2.03k
    case META_CAPTURE:
9895
2.03k
    group = META_DATA(*pptr);
9896
2.03k
    PCRE2_FALLTHROUGH /* Fall through */
9897
9898
2.34k
    case META_ATOMIC:
9899
3.23k
    case META_NOCAPTURE:
9900
3.30k
    case META_SCRIPT_RUN:
9901
3.30k
    pptr++;
9902
4.38k
    CHECK_GROUP:
9903
4.38k
    grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9904
4.38k
      lcptr, group, recurses, cb);
9905
4.38k
    if (grouplength < 0) return -1;
9906
3.91k
    itemlength = grouplength;
9907
3.91k
    itemminlength = groupminlength;
9908
3.91k
    break;
9909
9910
6.22k
    case META_QUERY:
9911
6.47k
    case META_QUERY_PLUS:
9912
6.83k
    case META_QUERY_QUERY:
9913
6.83k
    min = 0;
9914
6.83k
    max = 1;
9915
6.83k
    goto REPETITION;
9916
9917
    /* Exact repetition is OK; variable repetition is not. A repetition of zero
9918
    must subtract the length that has already been added. */
9919
9920
1.28k
    case META_MINMAX:
9921
1.51k
    case META_MINMAX_PLUS:
9922
1.78k
    case META_MINMAX_QUERY:
9923
1.78k
    min = pptr[1];
9924
1.78k
    max = pptr[2];
9925
1.78k
    pptr += 2;
9926
9927
8.61k
    REPETITION:
9928
8.61k
    if (max != REPEAT_UNLIMITED)
9929
8.61k
      {
9930
8.61k
      if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9931
8.25k
          max != 0 &&
9932
8.01k
          (INT_MAX - branchlength)/lastitemlength < max - 1)
9933
5
        {
9934
5
        *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9935
5
        return -1;
9936
5
        }
9937
8.60k
      if (min == 0) branchminlength -= lastitemminlength;
9938
1.09k
        else itemminlength = (min - 1) * lastitemminlength;
9939
8.60k
      if (max == 0) branchlength -= lastitemlength;
9940
8.37k
        else itemlength = (max - 1) * lastitemlength;
9941
8.60k
      break;
9942
8.61k
      }
9943
3
    PCRE2_FALLTHROUGH /* Fall through */
9944
3
9945
3
    /* Any other item means this branch does not have a fixed length. */
9946
3
9947
146
    default:
9948
291
    ISNOTFIXED:
9949
291
    *errcodeptr = ERR25;   /* Not fixed length */
9950
291
    return -1;
9951
109k
    }
9952
9953
  /* Add the item length to the branchlength, checking for integer overflow and
9954
  for the branch length exceeding the overall limit. Later, if there is at
9955
  least one variable-length branch in the group, there is a test for the
9956
  (smaller) variable-length branch length limit. */
9957
9958
352k
  if (INT_MAX - branchlength < (int)itemlength ||
9959
352k
      (branchlength += itemlength) > LOOKBEHIND_MAX)
9960
17
    {
9961
17
    *errcodeptr = ERR87;
9962
17
    return -1;
9963
17
    }
9964
9965
352k
  branchminlength += itemminlength;
9966
9967
  /* Save this item length for use if the next item is a quantifier. */
9968
9969
352k
  lastitemlength = itemlength;
9970
352k
  lastitemminlength = itemminlength;
9971
352k
  }
9972
9973
27.3k
EXIT:
9974
27.3k
*pptrptr = pptr;
9975
27.3k
*minptr = branchminlength;
9976
27.3k
return branchlength;
9977
9978
/* LCOV_EXCL_START */
9979
0
PARSED_SKIP_FAILED:
9980
0
PCRE2_DEBUG_UNREACHABLE();
9981
0
*errcodeptr = ERR90;  /* Unhandled META code - internal error */
9982
0
return -1;
9983
/* LCOV_EXCL_STOP */
9984
28.5k
}
9985
9986
9987
9988
/*************************************************
9989
*        Set lengths in a lookbehind             *
9990
*************************************************/
9991
9992
/* This function is called for each lookbehind, to set the lengths in its
9993
branches. An error occurs if any branch does not have a limited maximum length
9994
that is less than the limit (65535). On exit, the pointer must be left on the
9995
final ket.
9996
9997
The function also maintains the max_lookbehind value. Any lookbehind branch
9998
that contains a nested lookbehind may actually look further back than the
9999
length of the branch. Nested lookbehinds are handled inside get_branchlength(),
10000
which calls this function for each one; it passes back no separate "extra" value.
10001
10002
Arguments:
10003
  pptrptr     pointer to pointer in the parsed pattern
10004
  errcodeptr  pointer to error code
10005
  lcptr       pointer to loop counter
10006
  recurses    chain of recurse_check to catch mutual recursion
10007
  cb          pointer to compile block
10008
10009
Returns:      TRUE if all is well
10010
              FALSE otherwise, with error code and offset set
10011
*/
10012
10013
static BOOL
10014
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
10015
  parsed_recurse_check *recurses, compile_block *cb)
10016
12.9k
{
10017
12.9k
PCRE2_SIZE offset;
10018
12.9k
uint32_t *bptr = *pptrptr;
10019
12.9k
uint32_t *gbptr = bptr;
10020
12.9k
int maxlength = 0;
10021
12.9k
int minlength = INT_MAX;
10022
12.9k
BOOL variable = FALSE;
10023
10024
12.9k
READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
10025
12.9k
*pptrptr += SIZEOFFSET;
10026
10027
/* Each branch can have a different maximum length, but we can keep only a
10028
single minimum for the whole group, because there's nowhere to save individual
10029
values in the META_ALT item. */
10030
10031
12.9k
do
10032
21.8k
  {
10033
21.8k
  int branchlength, branchminlength;
10034
10035
21.8k
  *pptrptr += 1;
10036
21.8k
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
10037
21.8k
    recurses, cb);
10038
10039
21.8k
  if (branchlength < 0)
10040
588
    {
10041
    /* The errorcode and offset may already be set from a nested lookbehind. */
10042
588
    if (*errcodeptr == 0) *errcodeptr = ERR25;
10043
588
    if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
10044
588
    return FALSE;
10045
588
    }
10046
10047
21.3k
  if (branchlength != branchminlength) variable = TRUE;
10048
21.3k
  if (branchminlength < minlength) minlength = branchminlength;
10049
21.3k
  if (branchlength > maxlength) maxlength = branchlength;
10050
21.3k
  if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
10051
21.3k
  *bptr |= branchlength;  /* branchlength never more than 65535 */
10052
21.3k
  bptr = *pptrptr;
10053
21.3k
  }
10054
21.3k
while (META_CODE(*bptr) == META_ALT);
10055
10056
/* If any branch is of variable length, the whole lookbehind is of variable
10057
length. If the maximum length of any branch exceeds the maximum for variable
10058
lookbehinds, give an error. Otherwise, the minimum length is set in the word
10059
that follows the original group META value. For a fixed-length lookbehind, this
10060
is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
10061
possibly different) length. */
10062
10063
12.3k
if (variable)
10064
3.12k
  {
10065
3.12k
  gbptr[1] = minlength;
10066
3.12k
  if ((PCRE2_SIZE)maxlength > cb->max_varlookbehind)
10067
79
    {
10068
79
    *errcodeptr = ERR100;
10069
79
    cb->erroroffset = offset;
10070
79
    return FALSE;
10071
79
    }
10072
3.12k
  }
10073
9.22k
else gbptr[1] = LOOKBEHIND_MAX;
10074
10075
12.2k
return TRUE;
10076
12.3k
}
10077
10078
10079
10080
/*************************************************
10081
*         Check parsed pattern lookbehinds       *
10082
*************************************************/
10083
10084
/* This function is called at the end of parsing a pattern if any lookbehinds
10085
were encountered. It scans the parsed pattern for them, calling
10086
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
10087
the error offset is marked unset. This enables the functions above not to
10088
override settings from deeper nestings.
10089
10090
This function is called recursively from get_branchlength() for lookaheads in
10091
order to process any lookbehinds that they may contain. It stops when it hits a
10092
non-nested closing parenthesis in this case, returning a pointer to it.
10093
10094
Arguments
10095
  pptr      points to where to start (start of pattern or start of lookahead)
10096
  retptr    if not NULL, return the ket pointer here
10097
  recurses  chain of recurse_check to catch mutual recursion
10098
  cb        points to the compile block
10099
  lcptr     points to loop counter
10100
10101
Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
10102
*/
10103
10104
static int
10105
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
10106
  parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
10107
8.67k
{
10108
8.67k
int errorcode = 0;
10109
8.67k
int nestlevel = 0;
10110
10111
8.67k
cb->erroroffset = PCRE2_UNSET;
10112
10113
1.33M
for (; *pptr != META_END; pptr++)
10114
1.33M
  {
10115
1.33M
  if (*pptr < META_END) continue;  /* Literal */
10116
10117
290k
  switch (META_CODE(*pptr))
10118
290k
    {
10119
    /* The following erroroffset is a bogus but safe value. This branch should
10120
    be avoided by providing a proper implementation for all supported cases
10121
    below. */
10122
10123
    /* LCOV_EXCL_START */
10124
3
    default:
10125
3
    PCRE2_DEBUG_UNREACHABLE();
10126
3
    cb->erroroffset = 0;
10127
3
    return ERR70;  /* Unrecognized meta code */
10128
    /* LCOV_EXCL_STOP */
10129
10130
18.6k
    case META_ESCAPE:
10131
18.6k
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
10132
3.05k
      pptr += 1;    /* Skip prop data */
10133
18.6k
    break;
10134
10135
19.8k
    case META_KET:
10136
19.8k
    if (--nestlevel < 0)
10137
2.93k
      {
10138
2.93k
      if (retptr != NULL) *retptr = pptr;
10139
2.93k
      return 0;
10140
2.93k
      }
10141
16.9k
    break;
10142
10143
16.9k
    case META_ATOMIC:
10144
10.4k
    case META_CAPTURE:
10145
11.5k
    case META_COND_ASSERT:
10146
11.6k
    case META_SCS:
10147
12.6k
    case META_LOOKAHEAD:
10148
13.1k
    case META_LOOKAHEADNOT:
10149
13.7k
    case META_LOOKAHEAD_NA:
10150
15.5k
    case META_NOCAPTURE:
10151
15.7k
    case META_SCRIPT_RUN:
10152
15.7k
    nestlevel++;
10153
15.7k
    break;
10154
10155
386
    case META_ACCEPT:
10156
10.9k
    case META_ALT:
10157
17.8k
    case META_ASTERISK:
10158
18.2k
    case META_ASTERISK_PLUS:
10159
18.6k
    case META_ASTERISK_QUERY:
10160
19.7k
    case META_BACKREF:
10161
29.9k
    case META_CIRCUMFLEX:
10162
33.0k
    case META_CLASS:
10163
33.3k
    case META_CLASS_EMPTY:
10164
33.5k
    case META_CLASS_EMPTY_NOT:
10165
39.7k
    case META_CLASS_END:
10166
42.9k
    case META_CLASS_NOT:
10167
43.1k
    case META_COMMIT:
10168
45.0k
    case META_DOLLAR:
10169
46.4k
    case META_DOT:
10170
46.7k
    case META_FAIL:
10171
55.3k
    case META_PLUS:
10172
60.0k
    case META_PLUS_PLUS:
10173
60.4k
    case META_PLUS_QUERY:
10174
60.7k
    case META_PRUNE:
10175
77.0k
    case META_QUERY:
10176
77.3k
    case META_QUERY_PLUS:
10177
78.5k
    case META_QUERY_QUERY:
10178
78.7k
    case META_RANGE_ESCAPED:
10179
79.0k
    case META_RANGE_LITERAL:
10180
79.3k
    case META_SKIP:
10181
79.8k
    case META_THEN:
10182
79.8k
    break;
10183
10184
102
    case META_OFFSET:
10185
3.33k
    case META_RECURSE:
10186
3.33k
    pptr += SIZEOFFSET;
10187
3.33k
    break;
10188
10189
306
    case META_BACKREF_BYNAME:
10190
514
    case META_RECURSE_BYNAME:
10191
514
    pptr += 1 + SIZEOFFSET;
10192
514
    break;
10193
10194
0
    case META_COND_DEFINE:
10195
0
    pptr += SIZEOFFSET;
10196
0
    nestlevel++;
10197
0
    break;
10198
10199
203
    case META_COND_NAME:
10200
718
    case META_COND_NUMBER:
10201
784
    case META_COND_RNAME:
10202
1.26k
    case META_COND_RNUMBER:
10203
1.26k
    pptr += 1 + SIZEOFFSET;
10204
1.26k
    nestlevel++;
10205
1.26k
    break;
10206
10207
0
    case META_COND_VERSION:
10208
0
    pptr += 3;
10209
0
    nestlevel++;
10210
0
    break;
10211
10212
459
    case META_CALLOUT_STRING:
10213
459
    pptr += 3 + SIZEOFFSET;
10214
459
    break;
10215
10216
0
    case META_BIGVALUE:
10217
358
    case META_POSIX:
10218
1.15k
    case META_POSIX_NEG:
10219
1.69k
    case META_CAPTURE_NAME:
10220
2.34k
    case META_CAPTURE_NUMBER:
10221
2.34k
    pptr += 1;
10222
2.34k
    break;
10223
10224
4.32k
    case META_MINMAX:
10225
4.70k
    case META_MINMAX_QUERY:
10226
5.42k
    case META_MINMAX_PLUS:
10227
5.66k
    case META_OPTIONS:
10228
5.66k
    pptr += 2;
10229
5.66k
    break;
10230
10231
129k
    case META_CALLOUT_NUMBER:
10232
129k
    pptr += 3;
10233
129k
    break;
10234
10235
375
    case META_MARK:
10236
653
    case META_COMMIT_ARG:
10237
1.01k
    case META_PRUNE_ARG:
10238
1.24k
    case META_SKIP_ARG:
10239
1.46k
    case META_THEN_ARG:
10240
1.46k
    pptr += 1 + pptr[1];
10241
1.46k
    break;
10242
10243
    /* Note that set_lookbehind_lengths() updates pptr, leaving it pointing to
10244
    the final ket of the group, so no need to update it here. */
10245
10246
6.62k
    case META_LOOKBEHIND:
10247
10.3k
    case META_LOOKBEHINDNOT:
10248
11.9k
    case META_LOOKBEHIND_NA:
10249
11.9k
    if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10250
569
      return errorcode;
10251
11.3k
    break;
10252
290k
    }
10253
290k
  }
10254
10255
5.16k
return 0;
10256
8.67k
}
10257
10258
10259
10260
/*************************************************
10261
*     External function to compile a pattern     *
10262
*************************************************/
10263
10264
/* This function reads a regular expression in the form of a string and returns
10265
a pointer to a block of store holding a compiled version of the expression.
10266
10267
Arguments:
10268
  pattern       the regular expression
10269
  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10270
  options       option bits
10271
  errorptr      pointer to errorcode
10272
  erroroffset   pointer to error offset
10273
  ccontext      points to a compile context or is NULL
10274
10275
Returns:        pointer to compiled data block, or NULL on error,
10276
                with errorcode and erroroffset set
10277
*/
10278
10279
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
10280
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10281
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10282
82.9k
{
10283
82.9k
BOOL utf;                             /* Set TRUE for UTF mode */
10284
82.9k
BOOL ucp;                             /* Set TRUE for UCP mode */
10285
82.9k
BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10286
82.9k
BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10287
82.9k
pcre2_real_code *re = NULL;           /* What we will return */
10288
82.9k
compile_block cb;                     /* "Static" compile-time data */
10289
82.9k
const uint8_t *tables;                /* Char tables base pointer */
10290
10291
82.9k
PCRE2_UCHAR null_str[1] = { 0xcd };   /* Dummy for handling null inputs */
10292
82.9k
PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10293
82.9k
PCRE2_UCHAR *codestart;               /* Start of compiled code */
10294
82.9k
PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10295
82.9k
uint32_t *pptr;                       /* Current pointer in parsed pattern */
10296
10297
82.9k
PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10298
82.9k
PCRE2_SIZE usedlength;                /* Actual length used */
10299
82.9k
PCRE2_SIZE re_blocksize;              /* Size of memory block */
10300
82.9k
PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10301
10302
82.9k
uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10303
82.9k
uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10304
82.9k
uint32_t setflags = 0;                /* NL and BSR set flags */
10305
82.9k
uint32_t xoptions;                    /* Flags from context, modified */
10306
10307
82.9k
uint32_t skipatstart;                 /* When checking (*UTF) etc */
10308
82.9k
uint32_t limit_heap  = UINT32_MAX;
10309
82.9k
uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10310
82.9k
uint32_t limit_depth = UINT32_MAX;
10311
10312
82.9k
int newline = 0;                      /* Unset; can be set by the pattern */
10313
82.9k
int bsr = 0;                          /* Unset; can be set by the pattern */
10314
82.9k
int errorcode = 0;                    /* Initialize to avoid compiler warn */
10315
82.9k
int regexrc;                          /* Return from compile */
10316
10317
82.9k
uint32_t i;                           /* Local loop counter */
10318
10319
/* Enable all optimizations by default. */
10320
82.9k
uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags :
10321
82.9k
                                          PCRE2_OPTIMIZATION_ALL;
10322
10323
/* Comments at the head of this file explain about these variables. */
10324
10325
82.9k
uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10326
82.9k
uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10327
82.9k
named_group named_groups[NAMED_GROUP_LIST_SIZE];
10328
10329
/* The workspace is used in different ways in the different compiling phases.
10330
It needs to be 16-bit aligned for the preliminary parsing scan. */
10331
10332
82.9k
uint32_t c16workspace[C16_WORK_SIZE];
10333
82.9k
PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10334
10335
10336
/* -------------- Check arguments and set up the pattern ----------------- */
10337
10338
/* There must be error code and offset pointers. */
10339
10340
82.9k
if (errorptr == NULL)
10341
0
  {
10342
0
  if (erroroffset != NULL) *erroroffset = 0;
10343
0
  return NULL;
10344
0
  }
10345
82.9k
if (erroroffset == NULL)
10346
0
  {
10347
0
  if (errorptr != NULL) *errorptr = ERR120;
10348
0
  return NULL;
10349
0
  }
10350
82.9k
*errorptr = ERR0;
10351
82.9k
*erroroffset = 0;
10352
10353
/* There must be a pattern, but NULL is allowed with zero length. */
10354
10355
82.9k
if (pattern == NULL)
10356
0
  {
10357
0
  if (patlen == 0)
10358
0
    pattern = null_str;
10359
0
  else
10360
0
    {
10361
0
    *errorptr = ERR16;
10362
0
    return NULL;
10363
0
    }
10364
0
  }
10365
10366
/* A NULL compile context means "use a default context" */
10367
10368
82.9k
if (ccontext == NULL)
10369
0
  ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10370
10371
/* PCRE2_MATCH_INVALID_UTF implies UTF */
10372
10373
82.9k
if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10374
10375
/* Check that all undefined public option bits are zero. */
10376
10377
82.9k
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10378
82.9k
    (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10379
0
  {
10380
0
  *errorptr = ERR17;
10381
0
  return NULL;
10382
0
  }
10383
10384
82.9k
if ((options & PCRE2_LITERAL) != 0 &&
10385
0
    ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10386
0
     (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10387
0
  {
10388
0
  *errorptr = ERR92;
10389
0
  return NULL;
10390
0
  }
10391
10392
/* A zero-terminated pattern is indicated by the special length value
10393
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10394
10395
82.9k
if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10396
0
  patlen = PRIV(strlen)(pattern);
10397
82.9k
(void)zero_terminated; /* Silence compiler; only used if Valgrind enabled */
10398
10399
82.9k
if (patlen > ccontext->max_pattern_length)
10400
0
  {
10401
0
  *errorptr = ERR88;
10402
0
  return NULL;
10403
0
  }
10404
10405
/* Optimization flags in 'options' can override those in the compile context.
10406
This is because some options to disable optimizations were added before the
10407
optimization flags word existed, and we need to continue supporting them
10408
for backwards compatibility. */
10409
10410
82.9k
if ((options & PCRE2_NO_AUTO_POSSESS) != 0)
10411
14.6k
  optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;
10412
82.9k
if ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
10413
10.2k
  optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;
10414
82.9k
if ((options & PCRE2_NO_START_OPTIMIZE) != 0)
10415
17.4k
  optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;
10416
10417
/* From here on, all returns from this function should end up going via the
10418
EXIT label. */
10419
10420
10421
/* ------------ Initialize the "static" compile data -------------- */
10422
10423
82.9k
tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10424
10425
82.9k
cb.lcc = tables + lcc_offset;          /* Individual */
10426
82.9k
cb.fcc = tables + fcc_offset;          /*   character */
10427
82.9k
cb.cbits = tables + cbits_offset;      /*      tables */
10428
82.9k
cb.ctypes = tables + ctypes_offset;
10429
10430
82.9k
cb.assert_depth = 0;
10431
82.9k
cb.bracount = 0;
10432
82.9k
cb.cx = ccontext;
10433
82.9k
cb.dupnames = FALSE;
10434
82.9k
cb.end_pattern = pattern + patlen;
10435
82.9k
cb.erroroffset = 0;
10436
82.9k
cb.external_flags = 0;
10437
82.9k
cb.external_options = options;
10438
82.9k
cb.groupinfo = stack_groupinfo;
10439
82.9k
cb.had_recurse = FALSE;
10440
82.9k
cb.lastcapture = 0;
10441
82.9k
cb.max_lookbehind = 0;                               /* Max encountered */
10442
82.9k
cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10443
82.9k
cb.name_entry_size = 0;
10444
82.9k
cb.name_table = NULL;
10445
82.9k
cb.named_groups = named_groups;
10446
82.9k
cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10447
82.9k
cb.names_found = 0;
10448
82.9k
cb.parens_depth = 0;
10449
82.9k
cb.parsed_pattern = stack_parsed_pattern;
10450
82.9k
cb.req_varyopt = 0;
10451
82.9k
cb.start_code = cworkspace;
10452
82.9k
cb.start_pattern = pattern;
10453
82.9k
cb.start_workspace = cworkspace;
10454
82.9k
cb.workspace_size = COMPILE_WORK_SIZE;
10455
82.9k
cb.first_data = NULL;
10456
82.9k
cb.last_data = NULL;
10457
82.9k
#ifdef SUPPORT_WIDE_CHARS
10458
82.9k
cb.char_lists_size = 0;
10459
82.9k
#endif
10460
10461
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10462
references to help in deciding whether (.*) can be treated as anchored or not.
10463
*/
10464
10465
82.9k
cb.top_backref = 0;
10466
82.9k
cb.backref_map = 0;
10467
10468
/* Escape sequences \1 to \9 are always back references, but as they are only
10469
two characters long, only two elements can be used in the parsed_pattern
10470
vector. The first contains the reference, and we'd like to use the second to
10471
record the offset in the pattern, so that forward references to non-existent
10472
groups can be diagnosed later with an offset. However, on 64-bit systems,
10473
PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10474
occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10475
references have enough space for the offset to be put into the parsed pattern.
10476
*/
10477
10478
912k
for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10479
10480
10481
/* --------------- Start looking at the pattern --------------- */
10482
10483
/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10484
the start of the pattern, and remember the offset to the actual regex. With
10485
valgrind support, make the terminator of a zero-terminated pattern
10486
inaccessible. This catches bugs that would otherwise only show up for
10487
non-zero-terminated patterns. */
10488
10489
#ifdef SUPPORT_VALGRIND
10490
if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10491
#endif
10492
10493
82.9k
xoptions = ccontext->extra_options;
10494
82.9k
ptr = pattern;
10495
82.9k
skipatstart = 0;
10496
10497
82.9k
if ((options & PCRE2_LITERAL) == 0)
10498
82.9k
  {
10499
83.0k
  while (patlen - skipatstart >= 2 &&
10500
82.3k
         ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10501
23.2k
         ptr[skipatstart+1] == CHAR_ASTERISK)
10502
2.15k
    {
10503
51.2k
    for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10504
49.1k
      {
10505
49.1k
      const pso *p = pso_list + i;
10506
10507
49.1k
      if (patlen - skipatstart - 2 >= p->length &&
10508
35.9k
          PRIV(strncmp_c8)(ptr + skipatstart + 2, p->name, p->length) == 0)
10509
67
        {
10510
67
        uint32_t c, pp;
10511
10512
67
        skipatstart += p->length + 2;
10513
67
        switch(p->type)
10514
67
          {
10515
0
          case PSO_OPT:
10516
0
          cb.external_options |= p->value;
10517
0
          break;
10518
10519
0
          case PSO_XOPT:
10520
0
          xoptions |= p->value;
10521
0
          break;
10522
10523
0
          case PSO_FLG:
10524
0
          setflags |= p->value;
10525
0
          break;
10526
10527
67
          case PSO_NL:
10528
67
          newline = p->value;
10529
67
          setflags |= PCRE2_NL_SET;
10530
67
          break;
10531
10532
0
          case PSO_BSR:
10533
0
          bsr = p->value;
10534
0
          setflags |= PCRE2_BSR_SET;
10535
0
          break;
10536
10537
0
          case PSO_LIMM:
10538
0
          case PSO_LIMD:
10539
0
          case PSO_LIMH:
10540
0
          c = 0;
10541
0
          pp = skipatstart;
10542
0
          while (pp < patlen && IS_DIGIT(ptr[pp]))
10543
0
            {
10544
0
            if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10545
0
            c = c*10 + (ptr[pp++] - CHAR_0);
10546
0
            }
10547
0
          if (pp >= patlen || pp == skipatstart || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
10548
0
            {
10549
0
            errorcode = ERR60;
10550
0
            ptr += pp;
10551
0
            utf = FALSE;  /* Used by HAD_EARLY_ERROR */
10552
0
            goto HAD_EARLY_ERROR;
10553
0
            }
10554
0
          if (p->type == PSO_LIMH) limit_heap = c;
10555
0
            else if (p->type == PSO_LIMM) limit_match = c;
10556
0
            else limit_depth = c;
10557
0
          skipatstart = ++pp;
10558
0
          break;
10559
10560
0
          case PSO_OPTMZ:
10561
0
          optim_flags &= ~(p->value);
10562
10563
          /* For backward compatibility the three original VERBs to disable
10564
          optimizations need to also update the corresponding bit in the
10565
          external options. */
10566
10567
0
          switch(p->value)
10568
0
            {
10569
0
            case PCRE2_OPTIM_AUTO_POSSESS:
10570
0
            cb.external_options |= PCRE2_NO_AUTO_POSSESS;
10571
0
            break;
10572
10573
0
            case PCRE2_OPTIM_DOTSTAR_ANCHOR:
10574
0
            cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR;
10575
0
            break;
10576
10577
0
            case PCRE2_OPTIM_START_OPTIMIZE:
10578
0
            cb.external_options |= PCRE2_NO_START_OPTIMIZE;
10579
0
            break;
10580
0
            }
10581
10582
0
          break;
10583
10584
          /* LCOV_EXCL_START */
10585
0
          default:
10586
          /* All values in the enum need an explicit entry for this switch
10587
          but until a better way to prevent coding mistakes is invented keep
10588
          a catch all that triggers a debug build assert as a failsafe */
10589
0
          PCRE2_DEBUG_UNREACHABLE();
10590
          /* LCOV_EXCL_STOP */
10591
67
          }
10592
67
        break;   /* Out of the table scan loop */
10593
67
        }
10594
49.1k
      }
10595
2.15k
    if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10596
2.15k
    }
10597
82.9k
    PCRE2_ASSERT(skipatstart <= patlen);
10598
82.9k
  }
10599
10600
/* End of pattern-start options; advance to start of real regex. */
10601
10602
82.9k
ptr += skipatstart;
10603
10604
/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10605
10606
#ifndef SUPPORT_UNICODE
10607
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10608
  {
10609
  errorcode = ERR32;
10610
  goto HAD_EARLY_ERROR;
10611
  }
10612
#endif
10613
10614
/* Check UTF. We have the original options in 'options', with that value as
10615
modified by (*UTF) etc in cb->external_options. The extra option
10616
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10617
surrogate code points cannot be represented in UTF-16. */
10618
10619
82.9k
utf = (cb.external_options & PCRE2_UTF) != 0;
10620
82.9k
if (utf)
10621
23.7k
  {
10622
23.7k
  if ((options & PCRE2_NEVER_UTF) != 0)
10623
0
    {
10624
0
    errorcode = ERR74;
10625
0
    goto HAD_EARLY_ERROR;
10626
0
    }
10627
23.7k
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10628
23.7k
       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10629
766
    goto HAD_ERROR;  /* Offset was set by valid_utf() */
10630
10631
23.0k
#if PCRE2_CODE_UNIT_WIDTH == 16
10632
23.0k
  if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10633
0
    {
10634
0
    errorcode = ERR91;
10635
0
    goto HAD_EARLY_ERROR;
10636
0
    }
10637
23.0k
#endif
10638
23.0k
  }
10639
10640
/* Check UCP lockout. */
10641
10642
82.1k
ucp = (cb.external_options & PCRE2_UCP) != 0;
10643
82.1k
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10644
0
  {
10645
0
  errorcode = ERR75;
10646
0
  goto HAD_EARLY_ERROR;
10647
0
  }
10648
10649
/* PCRE2_EXTRA_TURKISH_CASING checks */
10650
10651
82.1k
if ((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)
10652
0
  {
10653
0
  if (!utf && !ucp)
10654
0
    {
10655
0
    errorcode = ERR104;
10656
0
    goto HAD_EARLY_ERROR;
10657
0
    }
10658
10659
#if PCRE2_CODE_UNIT_WIDTH == 8
10660
  if (!utf)
10661
    {
10662
    errorcode = ERR105;
10663
    goto HAD_EARLY_ERROR;
10664
    }
10665
#endif
10666
10667
0
  if ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)
10668
0
    {
10669
0
    errorcode = ERR106;
10670
0
    goto HAD_EARLY_ERROR;
10671
0
    }
10672
0
  }
10673
10674
/* Process the BSR setting. */
10675
10676
82.1k
if (bsr == 0) bsr = ccontext->bsr_convention;
10677
10678
/* Process the newline setting. */
10679
10680
82.1k
if (newline == 0) newline = ccontext->newline_convention;
10681
82.1k
cb.nltype = NLTYPE_FIXED;
10682
82.1k
switch(newline)
10683
82.1k
  {
10684
28
  case PCRE2_NEWLINE_CR:
10685
28
  cb.nllen = 1;
10686
28
  cb.nl[0] = CHAR_CR;
10687
28
  break;
10688
10689
82.1k
  case PCRE2_NEWLINE_LF:
10690
82.1k
  cb.nllen = 1;
10691
82.1k
  cb.nl[0] = CHAR_NL;
10692
82.1k
  break;
10693
10694
0
  case PCRE2_NEWLINE_NUL:
10695
0
  cb.nllen = 1;
10696
0
  cb.nl[0] = CHAR_NUL;
10697
0
  break;
10698
10699
0
  case PCRE2_NEWLINE_CRLF:
10700
0
  cb.nllen = 2;
10701
0
  cb.nl[0] = CHAR_CR;
10702
0
  cb.nl[1] = CHAR_NL;
10703
0
  break;
10704
10705
0
  case PCRE2_NEWLINE_ANY:
10706
0
  cb.nltype = NLTYPE_ANY;
10707
0
  break;
10708
10709
0
  case PCRE2_NEWLINE_ANYCRLF:
10710
0
  cb.nltype = NLTYPE_ANYCRLF;
10711
0
  break;
10712
10713
  /* LCOV_EXCL_START */
10714
0
  default:
10715
0
  PCRE2_DEBUG_UNREACHABLE();
10716
0
  errorcode = ERR56;
10717
0
  goto HAD_EARLY_ERROR;
10718
  /* LCOV_EXCL_STOP */
10719
82.1k
  }
10720
10721
/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10722
their numerical equivalents, so that this information is always available for
10723
the remaining processing. (2) At the same time, parse the pattern and put a
10724
processed version into the parsed_pattern vector. This has escapes interpreted
10725
and comments removed (amongst other things). */
10726
10727
/* Ensure that the parsed pattern buffer is big enough. For many smaller
10728
patterns the vector on the stack (which was set up above) can be used. */
10729
10730
82.1k
parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options);
10731
10732
/* Allow for 2x uint32_t at the start and 2 at the end, for
10733
PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). */
10734
10735
82.1k
if ((ccontext->extra_options &
10736
82.1k
     (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10737
0
  parsed_size_needed += 4;
10738
10739
/* When PCRE2_AUTO_CALLOUT is set we allow for one callout at the end. */
10740
10741
82.1k
if ((options & PCRE2_AUTO_CALLOUT) != 0)
10742
15.4k
  parsed_size_needed += 4;
10743
10744
82.1k
parsed_size_needed += 1;  /* For the final META_END */
10745
10746
82.1k
if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE)
10747
2.22k
  {
10748
2.22k
  uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10749
2.22k
    parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data);
10750
2.22k
  if (heap_parsed_pattern == NULL)
10751
0
    {
10752
0
    *errorptr = ERR21;
10753
0
    goto EXIT;
10754
0
    }
10755
2.22k
  cb.parsed_pattern = heap_parsed_pattern;
10756
2.22k
  }
10757
82.1k
cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed;
10758
10759
/* Do the parsing scan. */
10760
10761
82.1k
errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb);
10762
82.1k
if (errorcode != 0) goto HAD_CB_ERROR;
10763
10764
/* If there are any lookbehinds, scan the parsed pattern to figure out their
10765
lengths. Workspace is needed to remember whether numbered groups are or are not
10766
of limited length, and if limited, what the minimum and maximum lengths are.
10767
This caching saves re-computing the length of any group that is referenced more
10768
than once, which is particularly relevant when recursion is involved.
10769
Unnumbered groups do not have this exposure because they cannot be referenced.
10770
If there are sufficiently few groups, the default index vector on the stack, as
10771
set up above, can be used. Otherwise we have to get/free some heap memory. The
10772
vector must be initialized to zero. */
10773
10774
75.1k
if (has_lookbehind)
10775
5.60k
  {
10776
5.60k
  int loopcount = 0;
10777
5.60k
  if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10778
24
    {
10779
24
    cb.groupinfo = ccontext->memctl.malloc(
10780
24
      (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10781
24
    if (cb.groupinfo == NULL)
10782
0
      {
10783
0
      errorcode = ERR21;
10784
0
      cb.erroroffset = 0;
10785
0
      goto HAD_CB_ERROR;
10786
0
      }
10787
24
    }
10788
5.60k
  memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10789
5.60k
  errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10790
5.60k
  if (errorcode != 0) goto HAD_CB_ERROR;
10791
5.60k
  }
10792
10793
/* For debugging, there is a function that shows the parsed pattern vector. */
10794
10795
#ifdef DEBUG_SHOW_PARSED
10796
fprintf(stderr, "+++ Pre-scan complete:\n");
10797
show_parsed(&cb);
10798
#endif
10799
10800
/* For debugging capturing information this code can be enabled. */
10801
10802
#ifdef DEBUG_SHOW_CAPTURES
10803
  {
10804
  named_group *ng = cb.named_groups;
10805
  fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10806
  for (i = 0; i < cb.names_found; i++, ng++)
10807
    {
10808
    fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10809
    }
10810
  }
10811
#endif
10812
10813
/* Pretend to compile the pattern while actually just accumulating the amount
10814
of memory required in the 'length' variable. This behaviour is triggered by
10815
passing a non-NULL final argument to compile_regex(). We pass a block of
10816
workspace (cworkspace) for it to compile parts of the pattern into; the
10817
compiled code is discarded when it is no longer needed, so hopefully this
10818
workspace will never overflow, though there is a test for its doing so.
10819
10820
On error, errorcode will be set non-zero, so we don't need to look at the
10821
result of the function. The initial options have been put into the cb block,
10822
but we still have to pass a separate options variable (the first argument)
10823
because the options may change as the pattern is processed. */
10824
10825
74.6k
cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10826
74.6k
pptr = cb.parsed_pattern;
10827
74.6k
code = cworkspace;
10828
74.6k
*code = OP_BRA;
10829
10830
74.6k
(void)compile_regex(cb.external_options, xoptions, &code, &pptr,
10831
74.6k
   &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10832
74.6k
   &cb, &length);
10833
10834
74.6k
if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10835
10836
/* This should be caught in compile_regex(), but just in case... */
10837
10838
72.9k
#if defined SUPPORT_WIDE_CHARS
10839
72.9k
PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);
10840
72.9k
if (length > MAX_PATTERN_SIZE ||
10841
72.9k
    MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))
10842
#else
10843
if (length > MAX_PATTERN_SIZE)
10844
#endif
10845
11
  {
10846
11
  errorcode = ERR20;
10847
11
  cb.erroroffset = 0;
10848
11
  goto HAD_CB_ERROR;
10849
11
  }
10850
10851
/* Compute the size of, then, if not too large, get and initialize the data
10852
block for storing the compiled pattern and names table. Integer overflow should
10853
no longer be possible because nowadays we limit the maximum value of
10854
cb.names_found and cb.name_entry_size. */
10855
10856
72.9k
re_blocksize =
10857
72.9k
  CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10858
10859
72.9k
#if defined SUPPORT_WIDE_CHARS
10860
72.9k
if (cb.char_lists_size != 0)
10861
2.44k
  {
10862
2.44k
#if PCRE2_CODE_UNIT_WIDTH != 32
10863
  /* Align to 32 bit first. This ensures the
10864
  allocated area will also be 32 bit aligned. */
10865
2.44k
  re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));
10866
2.44k
#endif
10867
2.44k
  re_blocksize += cb.char_lists_size;
10868
2.44k
  }
10869
72.9k
#endif
10870
10871
72.9k
re_blocksize += CU2BYTES(length);
10872
10873
72.9k
if (re_blocksize > ccontext->max_pattern_compiled_length)
10874
0
  {
10875
0
  errorcode = ERR101;
10876
0
  cb.erroroffset = 0;
10877
0
  goto HAD_CB_ERROR;
10878
0
  }
10879
10880
72.9k
re_blocksize += sizeof(pcre2_real_code);
10881
72.9k
re = (pcre2_real_code *)
10882
72.9k
  ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10883
72.9k
if (re == NULL)
10884
0
  {
10885
0
  errorcode = ERR21;
10886
0
  cb.erroroffset = 0;
10887
0
  goto HAD_CB_ERROR;
10888
0
  }
10889
10890
/* The compiler may put padding at the end of the pcre2_real_code structure in
10891
order to round it up to a multiple of 4 or 8 bytes. This means that when a
10892
compiled pattern is copied (for example, when serialized) undefined bytes are
10893
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10894
write to the last 8 bytes of the structure before setting the fields. */
10895
10896
72.9k
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10897
72.9k
re->memctl = ccontext->memctl;
10898
72.9k
re->tables = tables;
10899
72.9k
re->executable_jit = NULL;
10900
72.9k
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10901
72.9k
re->blocksize = re_blocksize;
10902
72.9k
re->code_start = re_blocksize - CU2BYTES(length);
10903
72.9k
re->magic_number = MAGIC_NUMBER;
10904
72.9k
re->compile_options = options;
10905
72.9k
re->overall_options = cb.external_options;
10906
72.9k
re->extra_options = xoptions;
10907
72.9k
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10908
72.9k
re->limit_heap = limit_heap;
10909
72.9k
re->limit_match = limit_match;
10910
72.9k
re->limit_depth = limit_depth;
10911
72.9k
re->first_codeunit = 0;
10912
72.9k
re->last_codeunit = 0;
10913
72.9k
re->bsr_convention = bsr;
10914
72.9k
re->newline_convention = newline;
10915
72.9k
re->max_lookbehind = 0;
10916
72.9k
re->minlength = 0;
10917
72.9k
re->top_bracket = 0;
10918
72.9k
re->top_backref = 0;
10919
72.9k
re->name_entry_size = cb.name_entry_size;
10920
72.9k
re->name_count = cb.names_found;
10921
72.9k
re->optimization_flags = optim_flags;
10922
10923
/* The basic block is immediately followed by the name table, and the compiled
10924
code follows after that. */
10925
10926
72.9k
codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
10927
10928
/* Update the compile data block for the actual compile. The starting points of
10929
the name/number translation table and of the code are passed around in the
10930
compile data block. The start/end pattern and initial options are already set
10931
from the pre-compile phase, as is the name_entry_size field. */
10932
10933
72.9k
cb.parens_depth = 0;
10934
72.9k
cb.assert_depth = 0;
10935
72.9k
cb.lastcapture = 0;
10936
72.9k
cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10937
72.9k
cb.start_code = codestart;
10938
72.9k
cb.req_varyopt = 0;
10939
72.9k
cb.had_accept = FALSE;
10940
72.9k
cb.had_pruneorskip = FALSE;
10941
72.9k
#ifdef SUPPORT_WIDE_CHARS
10942
72.9k
cb.char_lists_size = 0;
10943
72.9k
#endif
10944
10945
10946
/* If any named groups were found, create the name/number table from the list
10947
created in the pre-pass. */
10948
10949
72.9k
if (cb.names_found > 0)
10950
1.27k
  {
10951
1.27k
  named_group *ng = cb.named_groups;
10952
1.27k
  uint32_t tablecount = 0;
10953
10954
  /* Length 0 represents duplicates, and they have already been handled. */
10955
7.57k
  for (i = 0; i < cb.names_found; i++, ng++)
10956
6.29k
    if (ng->length > 0)
10957
1.67k
      tablecount = PRIV(compile_add_name_to_table)(&cb, ng, tablecount);
10958
10959
1.27k
  PCRE2_ASSERT(tablecount == cb.names_found);
10960
1.27k
  }
10961
10962
/* Set up a starting, non-extracting bracket, then compile the expression. On
10963
error, errorcode will be set non-zero, so we don't need to look at the result
10964
of the function here. */
10965
10966
72.9k
pptr = cb.parsed_pattern;
10967
72.9k
code = (PCRE2_UCHAR *)codestart;
10968
72.9k
*code = OP_BRA;
10969
72.9k
regexrc = compile_regex(re->overall_options, re->extra_options, &code,
10970
72.9k
  &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10971
72.9k
  NULL, &cb, NULL);
10972
72.9k
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10973
72.9k
re->top_bracket = cb.bracount;
10974
72.9k
re->top_backref = cb.top_backref;
10975
72.9k
re->max_lookbehind = cb.max_lookbehind;
10976
10977
72.9k
if (cb.had_accept)
10978
4.55k
  {
10979
4.55k
  reqcu = 0;                     /* Must disable after (*ACCEPT) */
10980
4.55k
  reqcuflags = REQ_NONE;
10981
4.55k
  re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10982
4.55k
  }
10983
10984
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10985
but the estimated length exceeds the really used length, adjust the value of
10986
re->blocksize, and if valgrind support is configured, mark the extra allocated
10987
memory as unaddressable, so that any out-of-bound reads can be detected. */
10988
10989
72.9k
*code++ = OP_END;
10990
72.9k
usedlength = code - codestart;
10991
/* LCOV_EXCL_START */
10992
72.9k
if (usedlength > length)
10993
0
  {
10994
0
  PCRE2_DEBUG_UNREACHABLE();
10995
0
  errorcode = ERR23;  /* Overflow of code block - internal error */
10996
0
  cb.erroroffset = 0;
10997
0
  goto HAD_CB_ERROR;
10998
0
  }
10999
/* LCOV_EXCL_STOP */
11000
11001
72.9k
re->blocksize -= CU2BYTES(length - usedlength);
11002
#ifdef SUPPORT_VALGRIND
11003
VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
11004
#endif
11005
11006
/* Scan the pattern for recursion/subroutine calls and convert the group
11007
numbers into offsets. Maintain a small cache so that repeated groups containing
11008
recursions are efficiently handled. */
11009
11010
72.9k
#define RSCAN_CACHE_SIZE 8
11011
11012
72.9k
if (errorcode == 0 && cb.had_recurse)
11013
7.33k
  {
11014
7.33k
  PCRE2_UCHAR *rcode;
11015
7.33k
  PCRE2_SPTR rgroup;
11016
7.33k
  unsigned int ccount = 0;
11017
7.33k
  int start = RSCAN_CACHE_SIZE;
11018
7.33k
  recurse_cache rc[RSCAN_CACHE_SIZE];
11019
11020
7.33k
  for (rcode = find_recurse(codestart, utf);
11021
561k
       rcode != NULL;
11022
554k
       rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))
11023
554k
    {
11024
554k
    int p, groupnumber;
11025
11026
554k
    groupnumber = (int)GET(rcode, 1);
11027
554k
    if (groupnumber == 0) rgroup = codestart; else
11028
48.4k
      {
11029
48.4k
      PCRE2_SPTR search_from = codestart;
11030
48.4k
      rgroup = NULL;
11031
76.6k
      for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
11032
72.1k
        {
11033
72.1k
        if (groupnumber == rc[p].groupnumber)
11034
43.9k
          {
11035
43.9k
          rgroup = rc[p].group;
11036
43.9k
          break;
11037
43.9k
          }
11038
11039
        /* Group n+1 must always start to the right of group n, so we can save
11040
        search time below when the new group number is greater than any of the
11041
        previously found groups. */
11042
11043
28.1k
        if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
11044
28.1k
        }
11045
11046
48.4k
      if (rgroup == NULL)
11047
4.52k
        {
11048
4.52k
        rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
11049
        /* LCOV_EXCL_START */
11050
4.52k
        if (rgroup == NULL)
11051
0
          {
11052
0
          PCRE2_DEBUG_UNREACHABLE();
11053
0
          errorcode = ERR53;
11054
0
          break;
11055
0
          }
11056
        /* LCOV_EXCL_STOP */
11057
11058
4.52k
        if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
11059
4.52k
        rc[start].groupnumber = groupnumber;
11060
4.52k
        rc[start].group = rgroup;
11061
4.52k
        if (ccount < RSCAN_CACHE_SIZE) ccount++;
11062
4.52k
        }
11063
48.4k
      }
11064
11065
554k
    PUT(rcode, 1, (uint32_t)(rgroup - codestart));
11066
554k
    }
11067
7.33k
  }
11068
11069
/* In rare debugging situations we sometimes need to look at the compiled code
11070
at this stage. */
11071
11072
#ifdef DEBUG_CALL_PRINTINT
11073
pcre2_printint(re, stderr, TRUE);
11074
fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
11075
#endif
11076
11077
/* Unless disabled, check whether any single character iterators can be
11078
auto-possessified. The function overwrites the appropriate opcode values, so
11079
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
11080
used in this code because at least one compiler gives a warning about loss of
11081
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
11082
function call. */
11083
11084
72.9k
if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS) != 0)
11085
60.1k
  {
11086
60.1k
  PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
11087
60.1k
  int possessify_rc = PRIV(auto_possessify)(temp, &cb);
11088
  /* LCOV_EXCL_START */
11089
60.1k
  if (possessify_rc != 0)
11090
0
    {
11091
0
    PCRE2_DEBUG_UNREACHABLE();
11092
0
    errorcode = ERR80;
11093
0
    cb.erroroffset = 0;
11094
0
    }
11095
  /* LCOV_EXCL_STOP */
11096
60.1k
  }
11097
11098
/* Failed to compile, or error while post-processing. */
11099
11100
72.9k
if (errorcode != 0) goto HAD_CB_ERROR;
11101
11102
/* Successful compile. If the anchored option was not passed, set it if
11103
we can determine that the pattern is anchored by virtue of ^ characters or \A
11104
or anything else, such as starting with non-atomic .* when DOTALL is set and
11105
there are no occurrences of *PRUNE or *SKIP (though there is an option to
11106
disable this case). */
11107
11108
72.8k
if ((re->overall_options & PCRE2_ANCHORED) == 0)
11109
68.1k
  {
11110
68.1k
  BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
11111
68.1k
  if (is_anchored(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
11112
441
    re->overall_options |= PCRE2_ANCHORED;
11113
68.1k
  }
11114
11115
/* Set up the first code unit or startline flag, the required code unit, and
11116
then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE
11117
is disabled, as the data it would create will not be used. Note that a first code
11118
unit (but not the startline flag) is useful for anchored patterns because it
11119
can still give a quick "no match" and also avoid searching for a last code
11120
unit. */
11121
11122
72.8k
if ((optim_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)
11123
57.4k
  {
11124
57.4k
  int minminlength = 0;  /* For minimal minlength from first/required CU */
11125
57.4k
  int study_rc;
11126
11127
  /* If we do not have a first code unit, see if there is one that is asserted
11128
  (these are not saved during the compile because they can cause conflicts with
11129
  actual literals that follow). */
11130
11131
57.4k
  if (firstcuflags >= REQ_NONE) {
11132
39.6k
    uint32_t assertedcuflags = 0;
11133
39.6k
    uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0);
11134
    /* It would be wrong to use the asserted first code unit as `firstcu` for
11135
     * regexes which are able to match a 1-character string (e.g. /(?=a)b?a/)
11136
     * For that example, if we set both firstcu and reqcu to 'a', it would mean
11137
     * the subject string needs to be at least 2 characters long, which is wrong.
11138
     * With more analysis, we would be able to set firstcu in more cases. */
11139
39.6k
    if (assertedcuflags < REQ_NONE && assertedcu != reqcu) {
11140
1.14k
      firstcu = assertedcu;
11141
1.14k
      firstcuflags = assertedcuflags;
11142
1.14k
    }
11143
39.6k
  }
11144
11145
  /* Save the data for a first code unit. The existence of one means the
11146
  minimum length must be at least 1. */
11147
11148
57.4k
  if (firstcuflags < REQ_NONE)
11149
18.9k
    {
11150
18.9k
    re->first_codeunit = firstcu;
11151
18.9k
    re->flags |= PCRE2_FIRSTSET;
11152
18.9k
    minminlength++;
11153
11154
    /* Handle caseless first code units. */
11155
11156
18.9k
    if ((firstcuflags & REQ_CASELESS) != 0)
11157
4.15k
      {
11158
4.15k
      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
11159
1.52k
        {
11160
1.52k
        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
11161
1.52k
        }
11162
11163
      /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
11164
      In 8-bit UTF mode, code units in the range 128-255 are introductory code
11165
      units and cannot have another case, but if UCP is set they may do. */
11166
11167
2.63k
#ifdef SUPPORT_UNICODE
11168
#if PCRE2_CODE_UNIT_WIDTH == 8
11169
      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
11170
        re->flags |= PCRE2_FIRSTCASELESS;
11171
#else
11172
2.63k
      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
11173
1.75k
               UCD_OTHERCASE(firstcu) != firstcu)
11174
332
        re->flags |= PCRE2_FIRSTCASELESS;
11175
4.15k
#endif
11176
4.15k
#endif  /* SUPPORT_UNICODE */
11177
4.15k
      }
11178
18.9k
    }
11179
11180
  /* When there is no first code unit, for non-anchored patterns, see if we can
11181
  set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
11182
  branches start with ^ and also when all branches start with non-atomic .* for
11183
  non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
11184
  that disables this case.) */
11185
11186
38.4k
  else if ((re->overall_options & PCRE2_ANCHORED) == 0)
11187
36.9k
    {
11188
36.9k
    BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
11189
36.9k
    if (is_startline(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
11190
521
      re->flags |= PCRE2_STARTLINE;
11191
36.9k
    }
11192
11193
  /* Handle the "required code unit", if one is set. In the UTF case we can
11194
  increment the minimum minimum length only if we are sure this really is a
11195
  different character and not a non-starting code unit of the first character,
11196
  because the minimum length count is in characters, not code units. */
11197
11198
57.4k
  if (reqcuflags < REQ_NONE)
11199
32.5k
    {
11200
32.5k
#if PCRE2_CODE_UNIT_WIDTH == 16
11201
32.5k
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
11202
5.38k
        firstcuflags >= REQ_NONE ||                 /* First not set */
11203
2.32k
        (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
11204
23
        (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
11205
#elif PCRE2_CODE_UNIT_WIDTH == 8
11206
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
11207
        firstcuflags >= REQ_NONE ||                 /* First not set */
11208
        (firstcu & 0x80) == 0 ||                    /* First is ASCII */
11209
        (reqcu & 0x80) == 0)                        /* Req is ASCII */
11210
#endif
11211
32.5k
      {
11212
32.5k
      minminlength++;
11213
32.5k
      }
11214
11215
    /* In the case of an anchored pattern, set up the value only if it follows
11216
    a variable length item in the pattern. */
11217
11218
32.5k
    if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
11219
1.53k
        (reqcuflags & REQ_VARY) != 0)
11220
31.9k
      {
11221
31.9k
      re->last_codeunit = reqcu;
11222
31.9k
      re->flags |= PCRE2_LASTSET;
11223
11224
      /* Handle caseless required code units as for first code units (above). */
11225
11226
31.9k
      if ((reqcuflags & REQ_CASELESS) != 0)
11227
6.40k
        {
11228
6.40k
        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
11229
2.52k
          {
11230
2.52k
          if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
11231
2.52k
          }
11232
3.87k
#ifdef SUPPORT_UNICODE
11233
#if PCRE2_CODE_UNIT_WIDTH == 8
11234
      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
11235
        re->flags |= PCRE2_LASTCASELESS;
11236
#else
11237
3.87k
      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
11238
2.68k
               UCD_OTHERCASE(reqcu) != reqcu)
11239
584
        re->flags |= PCRE2_LASTCASELESS;
11240
6.40k
#endif
11241
6.40k
#endif  /* SUPPORT_UNICODE */
11242
6.40k
        }
11243
31.9k
      }
11244
32.5k
    }
11245
11246
  /* Study the compiled pattern to set up information such as a bitmap of
11247
  starting code units and a minimum matching length. */
11248
11249
57.4k
  study_rc = PRIV(study)(re);
11250
  /* LCOV_EXCL_START */
11251
57.4k
  if (study_rc != 0)
11252
0
    {
11253
0
    PCRE2_DEBUG_UNREACHABLE();
11254
0
    errorcode = ERR31;
11255
0
    cb.erroroffset = 0;
11256
0
    goto HAD_CB_ERROR;
11257
0
    }
11258
  /* LCOV_EXCL_STOP */
11259
11260
  /* If study() set a bitmap of starting code units, it implies a minimum
11261
  length of at least one. */
11262
11263
57.4k
  if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
11264
5.43k
    minminlength = 1;
11265
11266
  /* If the minimum length set (or not set) by study() is less than the minimum
11267
  implied by required code units, override it. */
11268
11269
57.4k
  if (re->minlength < minminlength) re->minlength = minminlength;
11270
57.4k
  }   /* End of start-of-match optimizations. */
11271
11272
/* Control ends up here in all cases. When running under valgrind, make a
11273
pattern's terminating zero defined again. If memory was obtained for the parsed
11274
version of the pattern, free it before returning. Also free the list of named
11275
groups if a larger one had to be obtained, and likewise the group information
11276
vector. */
11277
11278
72.8k
#ifdef SUPPORT_UNICODE
11279
/* All items must be freed. */
11280
72.8k
PCRE2_ASSERT(cb.first_data == NULL);
11281
72.8k
#endif
11282
11283
82.9k
EXIT:
11284
#ifdef SUPPORT_VALGRIND
11285
if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
11286
#endif
11287
82.9k
if (cb.parsed_pattern != stack_parsed_pattern)
11288
2.22k
  ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
11289
82.9k
if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
11290
75
  ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
11291
82.9k
if (cb.groupinfo != stack_groupinfo)
11292
24
  ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
11293
11294
82.9k
return re;    /* Will be NULL after an error */
11295
11296
/* Errors discovered in parse_regex() set the offset value in the compile
11297
block. Errors discovered before it is called must compute it from the ptr
11298
value. After parse_regex() is called, the offset in the compile block is set to
11299
the end of the pattern, but certain errors in compile_regex() may reset it if
11300
an offset is available in the parsed pattern. */
11301
11302
9.29k
HAD_CB_ERROR:
11303
9.29k
ptr = pattern + cb.erroroffset;
11304
11305
9.29k
HAD_EARLY_ERROR:
11306
/* Ensure we don't return out-of-range erroroffset. */
11307
9.29k
PCRE2_ASSERT(ptr >= pattern);
11308
9.29k
PCRE2_ASSERT(ptr <= (pattern + patlen));
11309
/* Ensure that the erroroffset never slices a UTF-encoded character in half.
11310
If the input is invalid, then we return an offset just before the first invalid
11311
character, so the text to the left of the offset must always be valid. */
11312
#if defined PCRE2_DEBUG && defined SUPPORT_UNICODE
11313
if (ptr > pattern && utf)
11314
  {
11315
  PCRE2_SPTR prev = ptr - 1;
11316
  PCRE2_SIZE dummyoffset;
11317
  BACKCHAR(prev);
11318
  PCRE2_ASSERT(prev >= pattern);
11319
  PCRE2_ASSERT(PRIV(valid_utf)(prev, ptr - prev, &dummyoffset) == 0);
11320
  }
11321
#endif
11322
9.29k
*erroroffset = ptr - pattern;
11323
11324
10.0k
HAD_ERROR:
11325
10.0k
*errorptr = errorcode;
11326
10.0k
pcre2_code_free(re);
11327
10.0k
re = NULL;
11328
11329
10.0k
if (cb.first_data != NULL)
11330
462
  {
11331
462
  compile_data* current_data = cb.first_data;
11332
462
  do
11333
8.49k
    {
11334
8.49k
    compile_data* next_data = current_data->next;
11335
8.49k
    cb.cx->memctl.free(current_data, cb.cx->memctl.memory_data);
11336
8.49k
    current_data = next_data;
11337
8.49k
    }
11338
8.49k
  while (current_data != NULL);
11339
462
  }
11340
11341
10.0k
goto EXIT;
11342
9.29k
}
11343
11344
/* These #undefs are here to enable unity builds with CMake. */
11345
11346
#undef NLBLOCK /* Block containing newline information */
11347
#undef PSSTART /* Field containing processed string start */
11348
#undef PSEND   /* Field containing processed string end */
11349
11350
/* End of pcre2_compile.c */