Coverage Report

Created: 2026-06-10 06:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libgit2/deps/pcre2/pcre2_compile.c
Line
Count
Source
1
/*************************************************
2
*      Perl-Compatible Regular Expressions       *
3
*************************************************/
4
5
/* PCRE is a library of functions to support regular expressions whose syntax
6
and semantics are as close as possible to those of the Perl 5 language.
7
8
                       Written by Philip Hazel
9
     Original API code Copyright (c) 1997-2012 University of Cambridge
10
          New API code Copyright (c) 2016-2024 University of Cambridge
11
12
-----------------------------------------------------------------------------
13
Redistribution and use in source and binary forms, with or without
14
modification, are permitted provided that the following conditions are met:
15
16
    * Redistributions of source code must retain the above copyright notice,
17
      this list of conditions and the following disclaimer.
18
19
    * Redistributions in binary form must reproduce the above copyright
20
      notice, this list of conditions and the following disclaimer in the
21
      documentation and/or other materials provided with the distribution.
22
23
    * Neither the name of the University of Cambridge nor the names of its
24
      contributors may be used to endorse or promote products derived from
25
      this software without specific prior written permission.
26
27
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37
POSSIBILITY OF SUCH DAMAGE.
38
-----------------------------------------------------------------------------
39
*/
40
41
42
#include "pcre2_compile.h"
43
44
45
46
21.5k
#define NLBLOCK cb             /* Block containing newline information */
47
#define PSSTART start_pattern  /* Field containing processed string start */
48
7.67k
#define PSEND   end_pattern    /* Field containing processed string end */
49
50
/* In rare error cases debugging might require calling pcre2_printint(). */
51
52
#if 0
53
#ifdef EBCDIC
54
#define PRINTABLE(c) ((c) >= 64 && (c) < 255)
55
#else
56
#define PRINTABLE(c) ((c) >= 32 && (c) < 127)
57
#endif
58
#define CHAR_OUTPUT(c)      (c)
59
#define CHAR_OUTPUT_HEX(c)  (c)
60
#define CHAR_INPUT(c)       (c)
61
#define CHAR_INPUT_HEX(c)   (c)
62
#include "pcre2_printint_inc.h"
63
#undef PRINTABLE
64
#undef CHAR_OUTPUT
65
#undef CHAR_OUTPUT_HEX
66
#undef CHAR_INPUT
67
#define DEBUG_CALL_PRINTINT
68
#endif
69
70
/* Other debugging code can be enabled by these defines. */
71
72
/* #define DEBUG_SHOW_CAPTURES */
73
/* #define DEBUG_SHOW_PARSED */
74
75
/* There are a few things that vary with different code unit sizes. Handle them
76
by defining macros in order to minimize #if usage. */
77
78
#if PCRE2_CODE_UNIT_WIDTH == 8
79
#define STRING_UTFn_RIGHTPAR     STRING_UTF8_RIGHTPAR, 5
80
2.32k
#define XDIGIT(c)                xdigitab[c]
81
82
#else  /* Either 16-bit or 32-bit */
83
#define XDIGIT(c)                (MAX_255(c)? xdigitab[c] : 0xff)
84
85
#if PCRE2_CODE_UNIT_WIDTH == 16
86
#define STRING_UTFn_RIGHTPAR     STRING_UTF16_RIGHTPAR, 6
87
88
#else  /* 32-bit */
89
#define STRING_UTFn_RIGHTPAR     STRING_UTF32_RIGHTPAR, 6
90
#endif
91
#endif
92
93
/* Function definitions to allow mutual recursion */
94
95
static int
96
  compile_regex(uint32_t, uint32_t, PCRE2_UCHAR **, uint32_t **, int *,
97
    uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *, branch_chain *,
98
    open_capitem *, compile_block *, PCRE2_SIZE *);
99
100
static int
101
  get_branchlength(uint32_t **, int *, int *, int *, parsed_recurse_check *,
102
    compile_block *);
103
104
static BOOL
105
  set_lookbehind_lengths(uint32_t **, int *, int *, parsed_recurse_check *,
106
    compile_block *);
107
108
static int
109
  check_lookbehinds(uint32_t *, uint32_t **, parsed_recurse_check *,
110
    compile_block *, int *);
111
112
113
/*************************************************
114
*      Code parameters and static tables         *
115
*************************************************/
116
117
428k
#define MAX_GROUP_NUMBER   65535u
118
123k
#define MAX_REPEAT_COUNT   65535u
119
100k
#define REPEAT_UNLIMITED   (MAX_REPEAT_COUNT+1)
120
121
/* COMPILE_WORK_SIZE specifies the size of stack workspace, which is used in
122
different ways in the different pattern scans. The parsing and group-
123
identifying pre-scan uses it to handle nesting, and needs it to be 16-bit
124
aligned for this. Having defined the size in code units, we set up
125
C16_WORK_SIZE as the number of elements in the 16-bit vector.
126
127
During the first compiling phase, when determining how much memory is required,
128
the regex is partly compiled into this space, but the compiled parts are
129
discarded as soon as they can be, so that hopefully there will never be an
130
overrun. The code does, however, check for an overrun, which can occur for
131
pathological patterns. The size of the workspace depends on LINK_SIZE because
132
the length of compiled items varies with this.
133
134
In the real compile phase, this workspace is not currently used. */
135
136
11.1k
#define COMPILE_WORK_SIZE (3000*LINK_SIZE)   /* Size in code units */
137
138
#define C16_WORK_SIZE \
139
  ((COMPILE_WORK_SIZE * sizeof(PCRE2_UCHAR))/sizeof(uint16_t))
140
141
/* A uint32_t vector is used for caching information about the size of
142
capturing groups, to improve performance. A default is created on the stack of
143
this size. */
144
145
1.04k
#define GROUPINFO_DEFAULT_SIZE 256
146
147
/* The overrun tests check for a slightly smaller size so that they detect the
148
overrun before it actually does run off the end of the data block. */
149
150
191k
#define WORK_SIZE_SAFETY_MARGIN (100)
151
152
/* This value determines the size of the initial vector that is used for
153
remembering named groups during the pre-compile. It is allocated on the stack,
154
but if it is too small, it is expanded, in a similar way to the workspace. The
155
value is the number of slots in the list. */
156
157
22.4k
#define NAMED_GROUP_LIST_SIZE  20
158
159
/* The pre-compiling pass over the pattern creates a parsed pattern in a vector
160
of uint32_t. For short patterns this lives on the stack, with this size. Heap
161
memory is used for longer patterns. */
162
163
11.1k
#define PARSED_PATTERN_DEFAULT_SIZE 1024
164
165
/* Maximum length value to check against when making sure that the variable
166
that holds the compiled pattern length does not overflow. We make it a bit less
167
than INT_MAX to allow for adding in group terminating code units, so that we
168
don't have to check them every time. */
169
170
250k
#define OFLOW_MAX (INT_MAX - 20)
171
172
/* Table of extra lengths for each of the meta codes. Must be kept in step with
173
the META_ code definitions (see pcre2_compile.h). For some items these values are a basic length to which
174
a variable amount has to be added. */
175
176
static unsigned char meta_extra_lengths[] = {
177
  0,             /* META_END */
178
  0,             /* META_ALT */
179
  0,             /* META_ATOMIC */
180
  0,             /* META_BACKREF - more if group is >= 10 */
181
  1+SIZEOFFSET,  /* META_BACKREF_BYNAME */
182
  1,             /* META_BIGVALUE */
183
  3,             /* META_CALLOUT_NUMBER */
184
  3+SIZEOFFSET,  /* META_CALLOUT_STRING */
185
  0,             /* META_CAPTURE */
186
  0,             /* META_CIRCUMFLEX */
187
  0,             /* META_CLASS */
188
  0,             /* META_CLASS_EMPTY */
189
  0,             /* META_CLASS_EMPTY_NOT */
190
  0,             /* META_CLASS_END */
191
  0,             /* META_CLASS_NOT */
192
  0,             /* META_COND_ASSERT */
193
  SIZEOFFSET,    /* META_COND_DEFINE */
194
  1+SIZEOFFSET,  /* META_COND_NAME */
195
  1+SIZEOFFSET,  /* META_COND_NUMBER */
196
  1+SIZEOFFSET,  /* META_COND_RNAME */
197
  1+SIZEOFFSET,  /* META_COND_RNUMBER */
198
  3,             /* META_COND_VERSION */
199
  SIZEOFFSET,    /* META_OFFSET */
200
  0,             /* META_SCS */
201
  1,             /* META_CAPTURE_NAME */
202
  1,             /* META_CAPTURE_NUMBER */
203
  0,             /* META_DOLLAR */
204
  0,             /* META_DOT */
205
  0,             /* META_ESCAPE - one more for ESC_P and ESC_p */
206
  0,             /* META_KET */
207
  0,             /* META_NOCAPTURE */
208
  2,             /* META_OPTIONS */
209
  1,             /* META_POSIX */
210
  1,             /* META_POSIX_NEG */
211
  0,             /* META_RANGE_ESCAPED */
212
  0,             /* META_RANGE_LITERAL */
213
  SIZEOFFSET,    /* META_RECURSE */
214
  1+SIZEOFFSET,  /* META_RECURSE_BYNAME */
215
  0,             /* META_SCRIPT_RUN */
216
  0,             /* META_LOOKAHEAD */
217
  0,             /* META_LOOKAHEADNOT */
218
  SIZEOFFSET,    /* META_LOOKBEHIND */
219
  SIZEOFFSET,    /* META_LOOKBEHINDNOT */
220
  0,             /* META_LOOKAHEAD_NA */
221
  SIZEOFFSET,    /* META_LOOKBEHIND_NA */
222
  1,             /* META_MARK - plus the string length */
223
  0,             /* META_ACCEPT */
224
  0,             /* META_FAIL */
225
  0,             /* META_COMMIT */
226
  1,             /* META_COMMIT_ARG - plus the string length */
227
  0,             /* META_PRUNE */
228
  1,             /* META_PRUNE_ARG - plus the string length */
229
  0,             /* META_SKIP */
230
  1,             /* META_SKIP_ARG - plus the string length */
231
  0,             /* META_THEN */
232
  1,             /* META_THEN_ARG - plus the string length */
233
  0,             /* META_ASTERISK */
234
  0,             /* META_ASTERISK_PLUS */
235
  0,             /* META_ASTERISK_QUERY */
236
  0,             /* META_PLUS */
237
  0,             /* META_PLUS_PLUS */
238
  0,             /* META_PLUS_QUERY */
239
  0,             /* META_QUERY */
240
  0,             /* META_QUERY_PLUS */
241
  0,             /* META_QUERY_QUERY */
242
  2,             /* META_MINMAX */
243
  2,             /* META_MINMAX_PLUS */
244
  2,             /* META_MINMAX_QUERY */
245
  0,             /* META_ECLASS_AND */
246
  0,             /* META_ECLASS_OR */
247
  0,             /* META_ECLASS_SUB */
248
  0,             /* META_ECLASS_XOR */
249
  0              /* META_ECLASS_NOT */
250
};
251
252
/* Types for skipping parts of a parsed pattern. */
253
254
enum { PSKIP_ALT, PSKIP_CLASS, PSKIP_KET };
255
256
/* Values and flags for the unsigned xxcuflags variables that accompany xxcu
257
variables, which are concerned with first and required code units. A value
258
greater than or equal to REQ_NONE means "no code unit set"; otherwise the
259
matching xxcu variable is set, and the low valued bits are relevant. */
260
261
597k
#define REQ_UNSET     0xffffffffu  /* Not yet found anything */
262
257k
#define REQ_NONE      0xfffffffeu  /* Found not fixed character */
263
7.58k
#define REQ_CASELESS  0x00000001u  /* Code unit in xxcu is caseless */
264
86.3k
#define REQ_VARY      0x00000002u  /* Code unit is followed by non-literal */
265
266
/* These flags are used in the groupinfo vector. */
267
268
2.84k
#define GI_SET_FIXED_LENGTH    0x80000000u
269
1.92k
#define GI_NOT_FIXED_LENGTH    0x40000000u
270
94
#define GI_FIXED_LENGTH_MASK   0x0000ffffu
271
272
/* This simple test for a decimal digit works for both ASCII/Unicode and EBCDIC
273
and is fast (a good compiler can turn it into a subtraction and unsigned
274
comparison). */
275
276
232k
#define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
277
278
/* Table to identify hex digits. The tables in chartables are dependent on the
279
locale, and may mark arbitrary characters as digits. We want to recognize only
280
0-9, a-f, and A-F as hex digits, which is why we have a private table here. It
281
costs 256 bytes, but it is a lot faster than doing character value tests (at
282
least in some simple cases I timed), and in some applications one wants PCRE2
283
to compile efficiently as well as match efficiently. The value in the table is
284
the binary hex digit value, or 0xff for non-hex digits. */
285
286
/* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
287
UTF-8 mode. */
288
289
#ifndef EBCDIC
290
static const uint8_t xdigitab[] =
291
  {
292
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7 */
293
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15 */
294
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 */
295
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31 */
296
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - '  */
297
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ( - /  */
298
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  */
299
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff, /*  8 - ?  */
300
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  @ - G  */
301
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H - O  */
302
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  P - W  */
303
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  X - _  */
304
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  ` - g  */
305
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h - o  */
306
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  p - w  */
307
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  x -127 */
308
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 128-135 */
309
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 136-143 */
310
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144-151 */
311
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 152-159 */
312
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160-167 */
313
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 168-175 */
314
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 176-183 */
315
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191 */
316
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 192-199 */
317
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 200-207 */
318
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 208-215 */
319
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 216-223 */
320
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 224-231 */
321
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 232-239 */
322
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 240-247 */
323
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};/* 248-255 */
324
325
#else
326
327
/* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
328
329
static const uint8_t xdigitab[] =
330
  {
331
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   0-  7  0 */
332
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*   8- 15    */
333
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  16- 23 10 */
334
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  24- 31    */
335
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  32- 39 20 */
336
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  40- 47    */
337
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  48- 55 30 */
338
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  56- 63    */
339
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*    - 71 40 */
340
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  72- |     */
341
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  & - 87 50 */
342
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  88- 95    */
343
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  - -103 60 */
344
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 104- ?     */
345
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 112-119 70 */
346
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 120- "     */
347
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /* 128- g  80 */
348
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  h -143    */
349
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 144- p  90 */
350
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  q -159    */
351
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 160- x  A0 */
352
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  y -175    */
353
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  ^ -183 B0 */
354
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /* 184-191    */
355
  0xff,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,0xff, /*  { - G  C0 */
356
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  H -207    */
357
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  } - P  D0 */
358
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Q -223    */
359
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  \ - X  E0 */
360
  0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, /*  Y -239    */
361
  0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, /*  0 - 7  F0 */
362
  0x08,0x09,0xff,0xff,0xff,0xff,0xff,0xff};/*  8 -255    */
363
#endif  /* EBCDIC */
364
365
366
/* Table for handling alphanumeric escaped characters. Positive returns are
367
simple data values; negative values are for special things like \d and so on.
368
Zero means further processing is needed (for things like \x), or the escape is
369
invalid. */
370
371
/* This is the "normal" table for ASCII systems or for EBCDIC systems running
372
in UTF-8 mode. It runs from '0' to 'z'. */
373
374
#ifndef EBCDIC
375
48.0k
#define ESCAPES_FIRST       CHAR_0
376
23.8k
#define ESCAPES_LAST        CHAR_z
377
104
#define UPPER_CASE(c)       (c-32)
378
379
static const short int escapes[] = {
380
    /* 0 */ 0,                       /* 1 */ 0,
381
    /* 2 */ 0,                       /* 3 */ 0,
382
    /* 4 */ 0,                       /* 5 */ 0,
383
    /* 6 */ 0,                       /* 7 */ 0,
384
    /* 8 */ 0,                       /* 9 */ 0,
385
    /* : */ ESCAPES_FIRST+0x0a,      /* ; */ ESCAPES_FIRST+0x0b,
386
    /* < */ ESCAPES_FIRST+0x0c,      /* = */ ESCAPES_FIRST+0x0d,
387
    /* > */ ESCAPES_FIRST+0x0e,      /* ? */ ESCAPES_FIRST+0x0f,
388
    /* @ */ ESCAPES_FIRST+0x10,      /* A */ -ESC_A,
389
    /* B */ -ESC_B,                  /* C */ -ESC_C,
390
    /* D */ -ESC_D,                  /* E */ -ESC_E,
391
    /* F */ 0,                       /* G */ -ESC_G,
392
    /* H */ -ESC_H,                  /* I */ 0,
393
    /* J */ 0,                       /* K */ -ESC_K,
394
    /* L */ 0,                       /* M */ 0,
395
    /* N */ -ESC_N,                  /* O */ 0,
396
    /* P */ -ESC_P,                  /* Q */ -ESC_Q,
397
    /* R */ -ESC_R,                  /* S */ -ESC_S,
398
    /* T */ 0,                       /* U */ 0,
399
    /* V */ -ESC_V,                  /* W */ -ESC_W,
400
    /* X */ -ESC_X,                  /* Y */ 0,
401
    /* Z */ -ESC_Z,                  /* [ */ ESCAPES_FIRST+0x2b,
402
    /* \ */ ESCAPES_FIRST+0x2c,      /* ] */ ESCAPES_FIRST+0x2d,
403
    /* ^ */ ESCAPES_FIRST+0x2e,      /* _ */ ESCAPES_FIRST+0x2f,
404
    /* ` */ ESCAPES_FIRST+0x30,      /* a */ CHAR_BEL,
405
    /* b */ -ESC_b,                  /* c */ 0,
406
    /* d */ -ESC_d,                  /* e */ CHAR_ESC,
407
    /* f */ CHAR_FF,                 /* g */ 0,
408
    /* h */ -ESC_h,                  /* i */ 0,
409
    /* j */ 0,                       /* k */ -ESC_k,
410
    /* l */ 0,                       /* m */ 0,
411
    /* n */ CHAR_LF,                 /* o */ 0,
412
    /* p */ -ESC_p,                  /* q */ 0,
413
    /* r */ CHAR_CR,                 /* s */ -ESC_s,
414
    /* t */ CHAR_HT,                 /* u */ 0,
415
    /* v */ -ESC_v,                  /* w */ -ESC_w,
416
    /* x */ 0,                       /* y */ 0,
417
    /* z */ -ESC_z
418
};
419
420
#else
421
422
/* This is the "abnormal" table for EBCDIC systems without UTF-8 support.
423
It runs from 'a' to '9'. Our EBCDIC support can be provided via the compiler,
424
which can interpret character literals like 'a' or '[' in an EBCDIC codepage;
425
in this case, there is wide variance between codepages on the interpretation of
426
characters between the letters ('[' and '{' and so on are placed in all sorts of
427
different positions in the table). Thankfully however, all EBCDIC codepages
428
place the letters and digits in the same location, so we hardcode that here.
429
Our EBCDIC support can also be provided via numeric literals instead of
430
character literals, so either way, 'CHAR_a' will be 0x81 when PCRE2 is compiled
431
in EBCDIC mode. */
432
433
#define ESCAPES_FIRST       CHAR_a
434
#define ESCAPES_LAST        CHAR_9
435
#define UPPER_CASE(c)       (c+64)
436
437
static const short int escapes[] = {
438
    /* 0x81 a */ CHAR_BEL,             /* 0x82 b */ -ESC_b,
439
    /* 0x83 c */ 0,                    /* 0x84 d */ -ESC_d,
440
    /* 0x85 e */ CHAR_ESC,             /* 0x86 f */ CHAR_FF,
441
    /* 0x87 g */ 0,                    /* 0x88 h */ -ESC_h,
442
    /* 0x89 i */ 0,                    /* 0x8a   */ ESCAPES_FIRST+0x09,
443
    /* 0x8b   */ ESCAPES_FIRST+0x0a,   /* 0x8c   */ ESCAPES_FIRST+0x0b,
444
    /* 0x8d   */ ESCAPES_FIRST+0x0c,   /* 0x8e   */ ESCAPES_FIRST+0x0d,
445
    /* 0x8f   */ ESCAPES_FIRST+0x0e,   /* 0x90   */ ESCAPES_FIRST+0x0f,
446
    /* 0x91 j */ 0,                    /* 0x92 k */ -ESC_k,
447
    /* 0x93 l */ 0,                    /* 0x94 m */ 0,
448
    /* 0x95 n */ CHAR_LF,              /* 0x96 o */ 0,
449
    /* 0x97 p */ -ESC_p,               /* 0x98 q */ 0,
450
    /* 0x99 r */ CHAR_CR,              /* 0x9a   */ ESCAPES_FIRST+0x19,
451
    /* 0x9b   */ ESCAPES_FIRST+0x1a,   /* 0x9c   */ ESCAPES_FIRST+0x1b,
452
    /* 0x9d   */ ESCAPES_FIRST+0x1c,   /* 0x9e   */ ESCAPES_FIRST+0x1d,
453
    /* 0x9f   */ ESCAPES_FIRST+0x1e,   /* 0xa0   */ ESCAPES_FIRST+0x1f,
454
    /* 0xa1   */ ESCAPES_FIRST+0x20,   /* 0xa2 s */ -ESC_s,
455
    /* 0xa3 t */ CHAR_HT,              /* 0xa4 u */ 0,
456
    /* 0xa5 v */ -ESC_v,               /* 0xa6 w */ -ESC_w,
457
    /* 0xa7 x */ 0,                    /* 0xa8 y */ 0,
458
    /* 0xa9 z */ -ESC_z,               /* 0xaa   */ ESCAPES_FIRST+0x29,
459
    /* 0xab   */ ESCAPES_FIRST+0x2a,   /* 0xac   */ ESCAPES_FIRST+0x2b,
460
    /* 0xad   */ ESCAPES_FIRST+0x2c,   /* 0xae   */ ESCAPES_FIRST+0x2d,
461
    /* 0xaf   */ ESCAPES_FIRST+0x2e,   /* 0xb0   */ ESCAPES_FIRST+0x2f,
462
    /* 0xb1   */ ESCAPES_FIRST+0x30,   /* 0xb2   */ ESCAPES_FIRST+0x31,
463
    /* 0xb3   */ ESCAPES_FIRST+0x32,   /* 0xb4   */ ESCAPES_FIRST+0x33,
464
    /* 0xb5   */ ESCAPES_FIRST+0x34,   /* 0xb6   */ ESCAPES_FIRST+0x35,
465
    /* 0xb7   */ ESCAPES_FIRST+0x36,   /* 0xb8   */ ESCAPES_FIRST+0x37,
466
    /* 0xb9   */ ESCAPES_FIRST+0x38,   /* 0xba   */ ESCAPES_FIRST+0x39,
467
    /* 0xbb   */ ESCAPES_FIRST+0x3a,   /* 0xbc   */ ESCAPES_FIRST+0x3b,
468
    /* 0xbd   */ ESCAPES_FIRST+0x3c,   /* 0xbe   */ ESCAPES_FIRST+0x3d,
469
    /* 0xbf   */ ESCAPES_FIRST+0x3e,   /* 0xc0   */ ESCAPES_FIRST+0x3f,
470
    /* 0xc1 A */ -ESC_A,               /* 0xc2 B */ -ESC_B,
471
    /* 0xc3 C */ -ESC_C,               /* 0xc4 D */ -ESC_D,
472
    /* 0xc5 E */ -ESC_E,               /* 0xc6 F */ 0,
473
    /* 0xc7 G */ -ESC_G,               /* 0xc8 H */ -ESC_H,
474
    /* 0xc9 I */ 0,                    /* 0xca   */ ESCAPES_FIRST+0x49,
475
    /* 0xcb   */ ESCAPES_FIRST+0x4a,   /* 0xcc   */ ESCAPES_FIRST+0x4b,
476
    /* 0xcd   */ ESCAPES_FIRST+0x4c,   /* 0xce   */ ESCAPES_FIRST+0x4d,
477
    /* 0xcf   */ ESCAPES_FIRST+0x4e,   /* 0xd0   */ ESCAPES_FIRST+0x4f,
478
    /* 0xd1 J */ 0,                    /* 0xd2 K */ -ESC_K,
479
    /* 0xd3 L */ 0,                    /* 0xd4 M */ 0,
480
    /* 0xd5 N */ -ESC_N,               /* 0xd6 O */ 0,
481
    /* 0xd7 P */ -ESC_P,               /* 0xd8 Q */ -ESC_Q,
482
    /* 0xd9 R */ -ESC_R,               /* 0xda   */ ESCAPES_FIRST+0x59,
483
    /* 0xdb   */ ESCAPES_FIRST+0x5a,   /* 0xdc   */ ESCAPES_FIRST+0x5b,
484
    /* 0xdd   */ ESCAPES_FIRST+0x5c,   /* 0xde   */ ESCAPES_FIRST+0x5d,
485
    /* 0xdf   */ ESCAPES_FIRST+0x5e,   /* 0xe0   */ ESCAPES_FIRST+0x5f,
486
    /* 0xe1   */ ESCAPES_FIRST+0x60,   /* 0xe2 S */ -ESC_S,
487
    /* 0xe3 T */ 0,                    /* 0xe4 U */ 0,
488
    /* 0xe5 V */ -ESC_V,               /* 0xe6 W */ -ESC_W,
489
    /* 0xe7 X */ -ESC_X,               /* 0xe8 Y */ 0,
490
    /* 0xe9 Z */ -ESC_Z,               /* 0xea   */ ESCAPES_FIRST+0x69,
491
    /* 0xeb   */ ESCAPES_FIRST+0x6a,   /* 0xec   */ ESCAPES_FIRST+0x6b,
492
    /* 0xed   */ ESCAPES_FIRST+0x6c,   /* 0xee   */ ESCAPES_FIRST+0x6d,
493
    /* 0xef   */ ESCAPES_FIRST+0x6e,   /* 0xf0 0 */ 0,
494
    /* 0xf1 1 */ 0,                    /* 0xf2 2 */ 0,
495
    /* 0xf3 3 */ 0,                    /* 0xf4 4 */ 0,
496
    /* 0xf5 5 */ 0,                    /* 0xf6 6 */ 0,
497
    /* 0xf7 7 */ 0,                    /* 0xf8 8 */ 0,
498
    /* 0xf9 9 */ 0,
499
};
500
501
/* We also need a table of characters that may follow \c in an EBCDIC
502
environment for characters 0-31. */
503
504
static unsigned char ebcdic_escape_c[] = {
505
  CHAR_COMMERCIAL_AT, CHAR_A, CHAR_B, CHAR_C, CHAR_D, CHAR_E, CHAR_F, CHAR_G,
506
  CHAR_H, CHAR_I, CHAR_J, CHAR_K, CHAR_L, CHAR_M, CHAR_N, CHAR_O, CHAR_P,
507
  CHAR_Q, CHAR_R, CHAR_S, CHAR_T, CHAR_U, CHAR_V, CHAR_W, CHAR_X, CHAR_Y,
508
  CHAR_Z, CHAR_LEFT_SQUARE_BRACKET, CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
509
  CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE
510
};
511
512
#endif   /* EBCDIC */
513
514
515
/* Table of special "verbs" like (*PRUNE). This is a short table, so it is
516
searched linearly. Put all the names into a single string, in order to reduce
517
the number of relocations when a shared library is dynamically linked. The
518
string is built from string macros so that it works in UTF-8 mode on EBCDIC
519
platforms. */
520
521
typedef struct verbitem {
522
  unsigned int len;          /* Length of verb name */
523
  uint32_t meta;             /* Base META_ code */
524
  int has_arg;               /* Argument requirement */
525
} verbitem;
526
527
static const char verbnames[] =
528
  "\0"                       /* Empty name is a shorthand for MARK */
529
  STRING_MARK0
530
  STRING_ACCEPT0
531
  STRING_F0
532
  STRING_FAIL0
533
  STRING_COMMIT0
534
  STRING_PRUNE0
535
  STRING_SKIP0
536
  STRING_THEN;
537
538
static const verbitem verbs[] = {
539
  { 0, META_MARK,   +1 },  /* > 0 => must have an argument */
540
  { 4, META_MARK,   +1 },
541
  { 6, META_ACCEPT, -1 },  /* < 0 => Optional argument, convert to pre-MARK */
542
  { 1, META_FAIL,   -1 },
543
  { 4, META_FAIL,   -1 },
544
  { 6, META_COMMIT,  0 },
545
  { 5, META_PRUNE,   0 },  /* Optional argument; bump META code if found */
546
  { 4, META_SKIP,    0 },
547
  { 4, META_THEN,    0 }
548
};
549
550
static const int verbcount = sizeof(verbs)/sizeof(verbitem);
551
552
/* Verb opcodes, indexed by their META code offset from META_MARK. */
553
554
static const uint32_t verbops[] = {
555
  OP_MARK, OP_ACCEPT, OP_FAIL, OP_COMMIT, OP_COMMIT_ARG, OP_PRUNE,
556
  OP_PRUNE_ARG, OP_SKIP, OP_SKIP_ARG, OP_THEN, OP_THEN_ARG };
557
558
/* Table of "alpha assertions" like (*pla:...), similar to the (*VERB) table. */
559
560
typedef struct alasitem {
561
  unsigned int len;          /* Length of name */
562
  uint32_t meta;             /* Base META_ code */
563
} alasitem;
564
565
static const char alasnames[] =
566
  STRING_pla0
567
  STRING_plb0
568
  STRING_napla0
569
  STRING_naplb0
570
  STRING_nla0
571
  STRING_nlb0
572
  STRING_positive_lookahead0
573
  STRING_positive_lookbehind0
574
  STRING_non_atomic_positive_lookahead0
575
  STRING_non_atomic_positive_lookbehind0
576
  STRING_negative_lookahead0
577
  STRING_negative_lookbehind0
578
  STRING_scs0
579
  STRING_scan_substring0
580
  STRING_atomic0
581
  STRING_sr0
582
  STRING_asr0
583
  STRING_script_run0
584
  STRING_atomic_script_run;
585
586
static const alasitem alasmeta[] = {
587
  {  3, META_LOOKAHEAD         },
588
  {  3, META_LOOKBEHIND        },
589
  {  5, META_LOOKAHEAD_NA      },
590
  {  5, META_LOOKBEHIND_NA     },
591
  {  3, META_LOOKAHEADNOT      },
592
  {  3, META_LOOKBEHINDNOT     },
593
  { 18, META_LOOKAHEAD         },
594
  { 19, META_LOOKBEHIND        },
595
  { 29, META_LOOKAHEAD_NA      },
596
  { 30, META_LOOKBEHIND_NA     },
597
  { 18, META_LOOKAHEADNOT      },
598
  { 19, META_LOOKBEHINDNOT     },
599
  {  3, META_SCS               },
600
  { 14, META_SCS               },
601
  {  6, META_ATOMIC            },
602
  {  2, META_SCRIPT_RUN        }, /* sr = script run */
603
  {  3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
604
  { 10, META_SCRIPT_RUN        }, /* script run */
605
  { 17, META_ATOMIC_SCRIPT_RUN }  /* atomic script run */
606
};
607
608
static const int alascount = sizeof(alasmeta)/sizeof(alasitem);
609
610
/* Offsets from OP_STAR for case-independent and negative repeat opcodes. */
611
612
static uint32_t chartypeoffset[] = {
613
  OP_STAR - OP_STAR,    OP_STARI - OP_STAR,
614
  OP_NOTSTAR - OP_STAR, OP_NOTSTARI - OP_STAR };
615
616
/* Tables of names of POSIX character classes and their lengths. The names are
617
now all in a single string, to reduce the number of relocations when a shared
618
library is dynamically loaded. The list of lengths is terminated by a zero
619
length entry. The first three must be alpha, lower, upper, as this is assumed
620
for handling case independence.
621
622
The indices for several classes are stored in pcre2_compile.h - these must
623
be kept in sync with posix_names, posix_name_lengths, posix_class_maps,
624
and posix_substitutes. */
625
626
static const char posix_names[] =
627
  STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
628
  STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
629
  STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
630
  STRING_word0  STRING_xdigit;
631
632
static const uint8_t posix_name_lengths[] = {
633
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
634
635
/* Table of class bit maps for each POSIX class. Each class is formed from a
636
base map, with an optional addition or removal of another map. Then, for some
637
classes, there is some additional tweaking: for [:blank:] the vertical space
638
characters are removed, and for [:alpha:] and [:alnum:] the underscore
639
character is removed. The triples in the table consist of the base map offset,
640
second map offset or -1 if no second map, and a non-negative value for map
641
addition or a negative value for map subtraction (if there are two maps). The
642
absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
643
remove vertical space characters, 2 => remove underscore. */
644
645
const int PRIV(posix_class_maps)[] = {
646
  cbit_word,   cbit_digit, -2,            /* alpha */
647
  cbit_lower,  -1,          0,            /* lower */
648
  cbit_upper,  -1,          0,            /* upper */
649
  cbit_word,   -1,          2,            /* alnum - word without underscore */
650
  cbit_print,  cbit_cntrl,  0,            /* ascii */
651
  cbit_space,  -1,          1,            /* blank - a GNU extension */
652
  cbit_cntrl,  -1,          0,            /* cntrl */
653
  cbit_digit,  -1,          0,            /* digit */
654
  cbit_graph,  -1,          0,            /* graph */
655
  cbit_print,  -1,          0,            /* print */
656
  cbit_punct,  -1,          0,            /* punct */
657
  cbit_space,  -1,          0,            /* space */
658
  cbit_word,   -1,          0,            /* word - a Perl extension */
659
  cbit_xdigit, -1,          0             /* xdigit */
660
};
661
662
#ifdef SUPPORT_UNICODE

/* Unicode property substitutes for the POSIX classes, for use in UCP mode.
Each class has a pair of entries, and the pairs must be kept in the same order
as the POSIX class names, defined above. */

static int posix_substitutes[] = {
  PT_GC, ucp_L,     /* alpha */
  PT_PC, ucp_Ll,    /* lower */
  PT_PC, ucp_Lu,    /* upper */
  PT_ALNUM, 0,      /* alnum */
  -1, 0,            /* ascii, treat as non-UCP */
  -1, 1,            /* blank, treat as \h */
  PT_PC, ucp_Cc,    /* cntrl */
  PT_PC, ucp_Nd,    /* digit */
  PT_PXGRAPH, 0,    /* graph */
  PT_PXPRINT, 0,    /* print */
  PT_PXPUNCT, 0,    /* punct */
  PT_PXSPACE, 0,    /* space - Xps is POSIX space, but from 8.34 Perl and
                       POSIX space are the same */
  PT_WORD, 0,       /* word */
  PT_PXXDIGIT, 0    /* xdigit - Perl has additional hex digits */
};

#endif  /* SUPPORT_UNICODE */
684
685
/* Bit masks for validating option settings. Only a subset of the options is
allowed when PCRE2_LITERAL is set; that subset is given by the two LITERAL
masks, and each of the general masks is its LITERAL mask plus further bits. */

#define PUBLIC_LITERAL_COMPILE_OPTIONS \
  (PCRE2_ANCHORED | \
   PCRE2_AUTO_CALLOUT | \
   PCRE2_CASELESS | \
   PCRE2_ENDANCHORED | \
   PCRE2_FIRSTLINE | \
   PCRE2_LITERAL | \
   PCRE2_MATCH_INVALID_UTF | \
   PCRE2_NO_START_OPTIMIZE | \
   PCRE2_NO_UTF_CHECK | \
   PCRE2_USE_OFFSET_LIMIT | \
   PCRE2_UTF)

#define PUBLIC_COMPILE_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_OPTIONS | \
   PCRE2_ALLOW_EMPTY_CLASS | \
   PCRE2_ALT_BSUX | \
   PCRE2_ALT_CIRCUMFLEX | \
   PCRE2_ALT_VERBNAMES | \
   PCRE2_DOLLAR_ENDONLY | \
   PCRE2_DOTALL | \
   PCRE2_DUPNAMES | \
   PCRE2_EXTENDED | \
   PCRE2_EXTENDED_MORE | \
   PCRE2_MATCH_UNSET_BACKREF | \
   PCRE2_MULTILINE | \
   PCRE2_NEVER_BACKSLASH_C | \
   PCRE2_NEVER_UCP | \
   PCRE2_NEVER_UTF | \
   PCRE2_NO_AUTO_CAPTURE | \
   PCRE2_NO_AUTO_POSSESS | \
   PCRE2_NO_DOTSTAR_ANCHOR | \
   PCRE2_UCP | \
   PCRE2_UNGREEDY | \
   PCRE2_ALT_EXTENDED_CLASS)

#define PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS \
  (PCRE2_EXTRA_MATCH_LINE | \
   PCRE2_EXTRA_MATCH_WORD | \
   PCRE2_EXTRA_CASELESS_RESTRICT | \
   PCRE2_EXTRA_TURKISH_CASING)

#define PUBLIC_COMPILE_EXTRA_OPTIONS \
  (PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS | \
   PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES | \
   PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL | \
   PCRE2_EXTRA_ESCAPED_CR_IS_LF | \
   PCRE2_EXTRA_ALT_BSUX | \
   PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK | \
   PCRE2_EXTRA_ASCII_BSD | \
   PCRE2_EXTRA_ASCII_BSS | \
   PCRE2_EXTRA_ASCII_BSW | \
   PCRE2_EXTRA_ASCII_POSIX | \
   PCRE2_EXTRA_ASCII_DIGIT | \
   PCRE2_EXTRA_PYTHON_OCTAL | \
   PCRE2_EXTRA_NO_BS0 | \
   PCRE2_EXTRA_NEVER_CALLOUT)
714
715
/* Start-of-pattern options such as (*UTF) and settings such as
(*LIMIT_MATCH=nnnn) and (*CRLF) are described by a table. For completeness and
backward compatibility, (*UTFn) is supported in the relevant libraries, but
(*UTF) is generic and always supported. The enumeration gives the possible
kinds of table entry. */

enum {
  PSO_OPT,     /* Value is an option bit */
  PSO_XOPT,    /* Value is an xoption bit */
  PSO_FLG,     /* Value is a flag bit */
  PSO_NL,      /* Value is a newline type */
  PSO_BSR,     /* Value is a \R type */
  PSO_LIMH,    /* Read integer value for heap limit */
  PSO_LIMM,    /* Read integer value for match limit */
  PSO_LIMD,    /* Read integer value for depth limit */
  PSO_OPTMZ    /* Value is an optimization bit */
};

/* The layout of one table entry. */

typedef struct pso {
  const char *name;    /* Text of the item */
  uint16_t length;     /* Length that goes with the name */
  uint16_t type;       /* One of the PSO_ values */
  uint32_t value;      /* Interpreted according to the type */
} pso;
737
738
/* NB: STRING_UTFn_RIGHTPAR contains the length as well */
739
740
static const pso pso_list[] = {
741
  { STRING_UTFn_RIGHTPAR,                  PSO_OPT, PCRE2_UTF },
742
  { STRING_UTF_RIGHTPAR,                4, PSO_OPT, PCRE2_UTF },
743
  { STRING_UCP_RIGHTPAR,                4, PSO_OPT, PCRE2_UCP },
744
  { STRING_NOTEMPTY_RIGHTPAR,           9, PSO_FLG, PCRE2_NOTEMPTY_SET },
745
  { STRING_NOTEMPTY_ATSTART_RIGHTPAR,  17, PSO_FLG, PCRE2_NE_ATST_SET },
746
  { STRING_NO_AUTO_POSSESS_RIGHTPAR,   16, PSO_OPTMZ, PCRE2_OPTIM_AUTO_POSSESS },
747
  { STRING_NO_DOTSTAR_ANCHOR_RIGHTPAR, 18, PSO_OPTMZ, PCRE2_OPTIM_DOTSTAR_ANCHOR },
748
  { STRING_NO_JIT_RIGHTPAR,             7, PSO_FLG, PCRE2_NOJIT },
749
  { STRING_NO_START_OPT_RIGHTPAR,      13, PSO_OPTMZ, PCRE2_OPTIM_START_OPTIMIZE },
750
  { STRING_CASELESS_RESTRICT_RIGHTPAR, 18, PSO_XOPT, PCRE2_EXTRA_CASELESS_RESTRICT },
751
  { STRING_TURKISH_CASING_RIGHTPAR,    15, PSO_XOPT, PCRE2_EXTRA_TURKISH_CASING },
752
  { STRING_LIMIT_HEAP_EQ,              11, PSO_LIMH, 0 },
753
  { STRING_LIMIT_MATCH_EQ,             12, PSO_LIMM, 0 },
754
  { STRING_LIMIT_DEPTH_EQ,             12, PSO_LIMD, 0 },
755
  { STRING_LIMIT_RECURSION_EQ,         16, PSO_LIMD, 0 },
756
  { STRING_CR_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_CR },
757
  { STRING_LF_RIGHTPAR,                 3, PSO_NL,  PCRE2_NEWLINE_LF },
758
  { STRING_CRLF_RIGHTPAR,               5, PSO_NL,  PCRE2_NEWLINE_CRLF },
759
  { STRING_ANY_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_ANY },
760
  { STRING_NUL_RIGHTPAR,                4, PSO_NL,  PCRE2_NEWLINE_NUL },
761
  { STRING_ANYCRLF_RIGHTPAR,            8, PSO_NL,  PCRE2_NEWLINE_ANYCRLF },
762
  { STRING_BSR_ANYCRLF_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_ANYCRLF },
763
  { STRING_BSR_UNICODE_RIGHTPAR,       12, PSO_BSR, PCRE2_BSR_UNICODE }
764
};
765
766
/* This table is used when converting repeating opcodes into possessified
767
versions as a result of an explicit possessive quantifier such as ++. A zero
768
value means there is no possessified version - in those cases the item in
769
question must be wrapped in ONCE brackets. The table is truncated at OP_CALLOUT
770
because all relevant opcodes are less than that. */
771
772
static const uint8_t opcode_possessify[] = {
773
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0 - 15  */
774
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 16 - 31 */
775
776
  0,                       /* NOTI */
777
  OP_POSSTAR, 0,           /* STAR, MINSTAR */
778
  OP_POSPLUS, 0,           /* PLUS, MINPLUS */
779
  OP_POSQUERY, 0,          /* QUERY, MINQUERY */
780
  OP_POSUPTO, 0,           /* UPTO, MINUPTO */
781
  0,                       /* EXACT */
782
  0, 0, 0, 0,              /* POS{STAR,PLUS,QUERY,UPTO} */
783
784
  OP_POSSTARI, 0,          /* STARI, MINSTARI */
785
  OP_POSPLUSI, 0,          /* PLUSI, MINPLUSI */
786
  OP_POSQUERYI, 0,         /* QUERYI, MINQUERYI */
787
  OP_POSUPTOI, 0,          /* UPTOI, MINUPTOI */
788
  0,                       /* EXACTI */
789
  0, 0, 0, 0,              /* POS{STARI,PLUSI,QUERYI,UPTOI} */
790
791
  OP_NOTPOSSTAR, 0,        /* NOTSTAR, NOTMINSTAR */
792
  OP_NOTPOSPLUS, 0,        /* NOTPLUS, NOTMINPLUS */
793
  OP_NOTPOSQUERY, 0,       /* NOTQUERY, NOTMINQUERY */
794
  OP_NOTPOSUPTO, 0,        /* NOTUPTO, NOTMINUPTO */
795
  0,                       /* NOTEXACT */
796
  0, 0, 0, 0,              /* NOTPOS{STAR,PLUS,QUERY,UPTO} */
797
798
  OP_NOTPOSSTARI, 0,       /* NOTSTARI, NOTMINSTARI */
799
  OP_NOTPOSPLUSI, 0,       /* NOTPLUSI, NOTMINPLUSI */
800
  OP_NOTPOSQUERYI, 0,      /* NOTQUERYI, NOTMINQUERYI */
801
  OP_NOTPOSUPTOI, 0,       /* NOTUPTOI, NOTMINUPTOI */
802
  0,                       /* NOTEXACTI */
803
  0, 0, 0, 0,              /* NOTPOS{STARI,PLUSI,QUERYI,UPTOI} */
804
805
  OP_TYPEPOSSTAR, 0,       /* TYPESTAR, TYPEMINSTAR */
806
  OP_TYPEPOSPLUS, 0,       /* TYPEPLUS, TYPEMINPLUS */
807
  OP_TYPEPOSQUERY, 0,      /* TYPEQUERY, TYPEMINQUERY */
808
  OP_TYPEPOSUPTO, 0,       /* TYPEUPTO, TYPEMINUPTO */
809
  0,                       /* TYPEEXACT */
810
  0, 0, 0, 0,              /* TYPEPOS{STAR,PLUS,QUERY,UPTO} */
811
812
  OP_CRPOSSTAR, 0,         /* CRSTAR, CRMINSTAR */
813
  OP_CRPOSPLUS, 0,         /* CRPLUS, CRMINPLUS */
814
  OP_CRPOSQUERY, 0,        /* CRQUERY, CRMINQUERY */
815
  OP_CRPOSRANGE, 0,        /* CRRANGE, CRMINRANGE */
816
  0, 0, 0, 0,              /* CRPOS{STAR,PLUS,QUERY,RANGE} */
817
818
  0, 0, 0, 0,              /* CLASS, NCLASS, XCLASS, ECLASS */
819
  0, 0,                    /* REF, REFI */
820
  0, 0,                    /* DNREF, DNREFI */
821
  0, 0,                    /* RECURSE, CALLOUT */
822
};
823
824
/* Compile-time check that the table has the correct size. */
825
STATIC_ASSERT(sizeof(opcode_possessify) == OP_CALLOUT+1, opcode_possessify);
826
827
828
#ifdef DEBUG_SHOW_PARSED
/*************************************************
*     Show the parsed pattern for debugging      *
*************************************************/

/* For debugging the pre-scan, this code, which outputs the parsed data vector,
can be enabled. Each item of the vector is written to stderr on a line of its
own, prefixed by its index in the vector and its raw value. Output stops after
META_END has been shown, or when an unknown META code is encountered.

The conversion specifications match the argument types: %u is used for
uint32_t values, %zu for PCRE2_SIZE offsets, and values printed with %c are
cast to int.

Argument:   cb    the compile block whose parsed_pattern vector is shown
Returns:    nothing
*/

static void show_parsed(compile_block *cb)
{
uint32_t *pptr = cb->parsed_pattern;

for (;;)
  {
  int max, min;
  PCRE2_SIZE offset;
  uint32_t i;
  uint32_t length;
  uint32_t meta_arg = META_DATA(*pptr);

  fprintf(stderr, "+++ %02d %.8x ", (int)(pptr - cb->parsed_pattern), *pptr);

  /* Values below META_END are literal characters; show the printable ones. */

  if (*pptr < META_END)
    {
    if (*pptr > 32 && *pptr < 128) fprintf(stderr, "%c", (int)*pptr);
    pptr++;
    }

  /* Otherwise it is a META item. Note that pptr is advanced past the META
  word before the switch body runs, so each case sees pptr pointing at the
  item's first data word (if it has any). */

  else switch (META_CODE(*pptr++))
    {
    default:
    fprintf(stderr, "**** OOPS - unknown META value - giving up ****\n");
    return;

    case META_END:
    fprintf(stderr, "META_END\n");
    return;

    case META_CAPTURE:
    fprintf(stderr, "META_CAPTURE %u", meta_arg);
    break;

    case META_RECURSE:
    GETOFFSET(offset, pptr);
    fprintf(stderr, "META_RECURSE %u %zu", meta_arg, offset);
    break;

    /* The offsets for references to groups 1-9 are taken from the compile
    block rather than from the parsed vector. */

    case META_BACKREF:
    if (meta_arg < 10)
      offset = cb->small_ref_offset[meta_arg];
    else
      GETOFFSET(offset, pptr);
    fprintf(stderr, "META_BACKREF %u %zu", meta_arg, offset);
    break;

    case META_ESCAPE:
    if (meta_arg == ESC_P || meta_arg == ESC_p)
      {
      uint32_t ptype = *pptr >> 16;
      uint32_t pvalue = *pptr++ & 0xffff;
      fprintf(stderr, "META \\%c %u %u", (meta_arg == ESC_P)? CHAR_P:CHAR_p,
        ptype, pvalue);
      }
    else
      {
      uint32_t cc;
      /* There's just one escape we might have here that isn't negated in the
      escapes table. */
      if (meta_arg == ESC_g) cc = CHAR_g;
      else for (cc = ESCAPES_FIRST; cc <= ESCAPES_LAST; cc++)
        {
        if (meta_arg == (uint32_t)(-escapes[cc - ESCAPES_FIRST])) break;
        }
      if (cc > ESCAPES_LAST) cc = CHAR_QUESTION_MARK;
      fprintf(stderr, "META \\%c", (int)cc);
      }
    break;

    case META_MINMAX:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}", min, max);
    else
      fprintf(stderr, "META {%d,}", min);
    break;

    case META_MINMAX_QUERY:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}?", min, max);
    else
      fprintf(stderr, "META {%d,}?", min);
    break;

    case META_MINMAX_PLUS:
    min = *pptr++;
    max = *pptr++;
    if (max != REPEAT_UNLIMITED)
      fprintf(stderr, "META {%d,%d}+", min, max);
    else
      fprintf(stderr, "META {%d,}+", min);
    break;

    case META_BIGVALUE: fprintf(stderr, "META_BIGVALUE %.8x", *pptr++); break;
    case META_CIRCUMFLEX: fprintf(stderr, "META_CIRCUMFLEX"); break;
    case META_COND_ASSERT: fprintf(stderr, "META_COND_ASSERT"); break;
    case META_DOLLAR: fprintf(stderr, "META_DOLLAR"); break;
    case META_DOT: fprintf(stderr, "META_DOT"); break;
    case META_ASTERISK: fprintf(stderr, "META *"); break;
    case META_ASTERISK_QUERY: fprintf(stderr, "META *?"); break;
    case META_ASTERISK_PLUS: fprintf(stderr, "META *+"); break;
    case META_PLUS: fprintf(stderr, "META +"); break;
    case META_PLUS_QUERY: fprintf(stderr, "META +?"); break;
    case META_PLUS_PLUS: fprintf(stderr, "META ++"); break;
    case META_QUERY: fprintf(stderr, "META ?"); break;
    case META_QUERY_QUERY: fprintf(stderr, "META ??"); break;
    case META_QUERY_PLUS: fprintf(stderr, "META ?+"); break;

    case META_ATOMIC: fprintf(stderr, "META (?>"); break;
    case META_NOCAPTURE: fprintf(stderr, "META (?:"); break;
    case META_LOOKAHEAD: fprintf(stderr, "META (?="); break;
    case META_LOOKAHEADNOT: fprintf(stderr, "META (?!"); break;
    case META_LOOKAHEAD_NA: fprintf(stderr, "META (*napla:"); break;
    case META_SCRIPT_RUN: fprintf(stderr, "META (*sr:"); break;
    case META_KET: fprintf(stderr, "META )"); break;
    case META_ALT: fprintf(stderr, "META | %u", meta_arg); break;

    case META_CLASS: fprintf(stderr, "META ["); break;
    case META_CLASS_NOT: fprintf(stderr, "META [^"); break;
    case META_CLASS_END: fprintf(stderr, "META ]"); break;
    case META_CLASS_EMPTY: fprintf(stderr, "META []"); break;
    case META_CLASS_EMPTY_NOT: fprintf(stderr, "META [^]"); break;

    case META_RANGE_LITERAL: fprintf(stderr, "META - (literal)"); break;
    case META_RANGE_ESCAPED: fprintf(stderr, "META - (escaped)"); break;

    case META_POSIX: fprintf(stderr, "META_POSIX %u", *pptr++); break;
    case META_POSIX_NEG: fprintf(stderr, "META_POSIX_NEG %u", *pptr++); break;

    case META_ACCEPT: fprintf(stderr, "META (*ACCEPT)"); break;
    case META_FAIL: fprintf(stderr, "META (*FAIL)"); break;
    case META_COMMIT: fprintf(stderr, "META (*COMMIT)"); break;
    case META_PRUNE: fprintf(stderr, "META (*PRUNE)"); break;
    case META_SKIP: fprintf(stderr, "META (*SKIP)"); break;
    case META_THEN: fprintf(stderr, "META (*THEN)"); break;

    case META_OPTIONS:
    fprintf(stderr, "META_OPTIONS 0x%08x 0x%08x", pptr[0], pptr[1]);
    pptr += 2;
    break;

    /* For the lookbehinds, only the first of the two following words is
    shown, but both are skipped. */

    case META_LOOKBEHIND:
    fprintf(stderr, "META (?<= %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHIND_NA:
    fprintf(stderr, "META (*naplb: %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_LOOKBEHINDNOT:
    fprintf(stderr, "META (?<! %u %u", meta_arg, *pptr);
    pptr += 2;
    break;

    case META_CALLOUT_NUMBER:
    fprintf(stderr, "META (?C%u) next=%u/%u", pptr[2], pptr[0],
       pptr[1]);
    pptr += 3;
    break;

    case META_CALLOUT_STRING:
      {
      uint32_t patoffset = *pptr++;    /* Offset of next pattern item */
      uint32_t patlength = *pptr++;    /* Length of next pattern item */
      fprintf(stderr, "META (?Cstring) length=%u offset=", *pptr++);
      GETOFFSET(offset, pptr);
      fprintf(stderr, "%zu next=%u/%u", offset, patoffset, patlength);
      }
    break;

    case META_RECURSE_BYNAME:
    fprintf(stderr, "META (?(&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_BACKREF_BYNAME:
    fprintf(stderr, "META_BACKREF_BYNAME length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* Here the offset comes first in the vector and the number follows it, so
    the number is fetched by indexing before the offset is read. */

    case META_COND_NUMBER:
    fprintf(stderr, "META_COND_NUMBER %u offset=", pptr[SIZEOFFSET]);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    pptr++;
    break;

    case META_COND_DEFINE:
    fprintf(stderr, "META (?(DEFINE) offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_VERSION:
    fprintf(stderr, "META (?(VERSION%s", (*pptr++ == 0)? "=" : ">=");
    fprintf(stderr, "%u.", *pptr++);
    fprintf(stderr, "%u)", *pptr++);
    break;

    case META_COND_NAME:
    fprintf(stderr, "META (?(<name>) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_COND_RNAME:
    fprintf(stderr, "META (?(R&name) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    /* This is kept as a name, because it might be. */

    case META_COND_RNUMBER:
    fprintf(stderr, "META (?(Rnumber) length=%u offset=", *pptr++);
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_OFFSET:
    fprintf(stderr, "META_OFFSET offset=");
    GETOFFSET(offset, pptr);
    fprintf(stderr, "%zu", offset);
    break;

    case META_SCS:
    fprintf(stderr, "META (*scan_substring:");
    break;

    case META_CAPTURE_NAME:
    fprintf(stderr, "META_CAPTURE_NAME length=%u relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    case META_CAPTURE_NUMBER:
    fprintf(stderr, "META_CAPTURE_NUMBER %u relative_offset=%d", *pptr++, (int)meta_arg);
    break;

    /* The verbs that take an argument share the code that shows it: a length
    word followed by that many character values. */

    case META_MARK:
    fprintf(stderr, "META (*MARK:");
    goto SHOWARG;

    case META_COMMIT_ARG:
    fprintf(stderr, "META (*COMMIT:");
    goto SHOWARG;

    case META_PRUNE_ARG:
    fprintf(stderr, "META (*PRUNE:");
    goto SHOWARG;

    case META_SKIP_ARG:
    fprintf(stderr, "META (*SKIP:");
    goto SHOWARG;

    case META_THEN_ARG:
    fprintf(stderr, "META (*THEN:");
    SHOWARG:
    length = *pptr++;
    for (i = 0; i < length; i++)
      {
      uint32_t cc = *pptr++;
      if (cc > 32 && cc < 128) fprintf(stderr, "%c", (int)cc);
        else fprintf(stderr, "\\x{%x}", cc);
      }
    fprintf(stderr, ") length=%u", length);
    break;

    case META_ECLASS_AND: fprintf(stderr, "META_ECLASS_AND"); break;
    case META_ECLASS_OR: fprintf(stderr, "META_ECLASS_OR"); break;
    case META_ECLASS_SUB: fprintf(stderr, "META_ECLASS_SUB"); break;
    case META_ECLASS_XOR: fprintf(stderr, "META_ECLASS_XOR"); break;
    case META_ECLASS_NOT: fprintf(stderr, "META_ECLASS_NOT"); break;
    }
  fprintf(stderr, "\n");
  }
return;
}
#endif  /* DEBUG_SHOW_PARSED */
1121
1122
1123
1124
/*************************************************
1125
*               Copy compiled code               *
1126
*************************************************/
1127
1128
/* Compiled JIT code cannot be copied, so the new compiled block has no
1129
associated JIT data. */
1130
1131
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1132
pcre2_code_copy(const pcre2_code *code)
1133
0
{
1134
0
PCRE2_SIZE *ref_count;
1135
0
pcre2_code *newcode;
1136
1137
0
if (code == NULL) return NULL;
1138
0
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1139
0
if (newcode == NULL) return NULL;
1140
0
memcpy(newcode, code, code->blocksize);
1141
0
newcode->executable_jit = NULL;
1142
1143
/* If the code is one that has been deserialized, increment the reference count
1144
in the decoded tables. */
1145
1146
0
if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1147
0
  {
1148
0
  ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1149
0
  (*ref_count)++;
1150
0
  }
1151
1152
0
return newcode;
1153
0
}
1154
1155
1156
1157
/*************************************************
1158
*     Copy compiled code and character tables    *
1159
*************************************************/
1160
1161
/* Compiled JIT code cannot be copied, so the new compiled block has no
1162
associated JIT data. This version of code_copy also makes a separate copy of
1163
the character tables. */
1164
1165
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
1166
pcre2_code_copy_with_tables(const pcre2_code *code)
1167
0
{
1168
0
PCRE2_SIZE* ref_count;
1169
0
pcre2_code *newcode;
1170
0
uint8_t *newtables;
1171
1172
0
if (code == NULL) return NULL;
1173
0
newcode = code->memctl.malloc(code->blocksize, code->memctl.memory_data);
1174
0
if (newcode == NULL) return NULL;
1175
0
memcpy(newcode, code, code->blocksize);
1176
0
newcode->executable_jit = NULL;
1177
1178
0
newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),
1179
0
  code->memctl.memory_data);
1180
0
if (newtables == NULL)
1181
0
  {
1182
0
  code->memctl.free((void *)newcode, code->memctl.memory_data);
1183
0
  return NULL;
1184
0
  }
1185
0
memcpy(newtables, code->tables, TABLES_LENGTH);
1186
0
ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);
1187
0
*ref_count = 1;
1188
1189
0
newcode->tables = newtables;
1190
0
newcode->flags |= PCRE2_DEREF_TABLES;
1191
0
return newcode;
1192
0
}
1193
1194
1195
1196
/*************************************************
1197
*               Free compiled code               *
1198
*************************************************/
1199
1200
PCRE2_EXP_DEFN void PCRE2_CALL_CONVENTION
1201
pcre2_code_free(pcre2_code *code)
1202
16.1k
{
1203
16.1k
PCRE2_SIZE* ref_count;
1204
1205
16.1k
if (code != NULL)
1206
6.24k
  {
1207
#ifdef SUPPORT_JIT
1208
  if (code->executable_jit != NULL)
1209
    PRIV(jit_free)(code->executable_jit, &code->memctl);
1210
#endif
1211
1212
6.24k
  if ((code->flags & PCRE2_DEREF_TABLES) != 0)
1213
0
    {
1214
    /* Decoded tables belong to the codes after deserialization, and they must
1215
    be freed when there are no more references to them. The *ref_count should
1216
    always be > 0. */
1217
1218
0
    ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);
1219
0
    if (*ref_count > 0)
1220
0
      {
1221
0
      (*ref_count)--;
1222
0
      if (*ref_count == 0)
1223
0
        code->memctl.free((void *)code->tables, code->memctl.memory_data);
1224
0
      }
1225
0
    }
1226
1227
6.24k
  code->memctl.free(code, code->memctl.memory_data);
1228
6.24k
  }
1229
16.1k
}
1230
1231
1232
1233
/*************************************************
1234
*         Read a number, possibly signed         *
1235
*************************************************/
1236
1237
/* This function is used to read numbers in the pattern. The initial pointer
1238
must be at the sign or first digit of the number. When relative values
1239
(introduced by + or -) are allowed, they are relative group numbers, and the
1240
result must be greater than zero.
1241
1242
Arguments:
1243
  ptrptr      points to the character pointer variable
1244
  ptrend      points to the end of the input string
1245
  allow_sign  if < 0, sign not allowed; if >= 0, sign is relative to this
1246
  max_value   the largest number allowed;
1247
              you must not pass a value for max_value larger than
1248
              INT_MAX/10 - 1 because this function relies on max_value to
1249
              avoid integer overflow
1250
  max_error   the error to give for an over-large number
1251
  intptr      where to put the result
1252
  errcodeptr  where to put an error code
1253
1254
Returns:      TRUE  - a number was read
1255
              FALSE - errorcode == 0 => no number was found
1256
                      errorcode != 0 => an error occurred
1257
*/
1258
1259
static BOOL
1260
read_number(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, int32_t allow_sign,
1261
  uint32_t max_value, uint32_t max_error, int *intptr, int *errorcodeptr)
1262
43.2k
{
1263
43.2k
int sign = 0;
1264
43.2k
uint32_t n = 0;
1265
43.2k
PCRE2_SPTR ptr = *ptrptr;
1266
43.2k
BOOL yield = FALSE;
1267
1268
43.2k
PCRE2_ASSERT(max_value <= INT_MAX/10 - 1);
1269
1270
43.2k
*errorcodeptr = 0;
1271
1272
43.2k
if (allow_sign >= 0 && ptr < ptrend)
1273
12.3k
  {
1274
12.3k
  if (*ptr == CHAR_PLUS)
1275
4.88k
    {
1276
4.88k
    sign = +1;
1277
4.88k
    max_value -= allow_sign;
1278
4.88k
    ptr++;
1279
4.88k
    }
1280
7.42k
  else if (*ptr == CHAR_MINUS)
1281
1.03k
    {
1282
1.03k
    sign = -1;
1283
1.03k
    ptr++;
1284
1.03k
    }
1285
12.3k
  }
1286
1287
43.2k
if (ptr >= ptrend || !IS_DIGIT(*ptr)) return FALSE;
1288
88.7k
while (ptr < ptrend && IS_DIGIT(*ptr))
1289
52.1k
  {
1290
52.1k
  n = n * 10 + (*ptr++ - CHAR_0);
1291
52.1k
  if (n > max_value)
1292
367
    {
1293
367
    *errorcodeptr = max_error;
1294
2.16k
    while (ptr < ptrend && IS_DIGIT(*ptr)) ptr++;
1295
367
    goto EXIT;
1296
367
    }
1297
52.1k
  }
1298
1299
36.5k
if (allow_sign >= 0 && sign != 0)
1300
5.78k
  {
1301
5.78k
  if (n == 0)
1302
2
    {
1303
2
    *errorcodeptr = ERR26;  /* +0 and -0 are not allowed */
1304
2
    goto EXIT;
1305
2
    }
1306
1307
5.78k
  if (sign > 0) n += allow_sign;
1308
967
  else if (n > (uint32_t)allow_sign)
1309
46
    {
1310
46
    *errorcodeptr = ERR15;  /* Non-existent subpattern */
1311
46
    goto EXIT;
1312
46
    }
1313
921
  else n = allow_sign + 1 - n;
1314
5.78k
  }
1315
1316
36.4k
yield = TRUE;
1317
1318
36.9k
EXIT:
1319
36.9k
*intptr = n;
1320
36.9k
*ptrptr = ptr;
1321
36.9k
return yield;
1322
36.4k
}
1323
1324
1325
1326
/*************************************************
1327
*         Read repeat counts                     *
1328
*************************************************/
1329
1330
/* Read an item of the form {n,m} and return the values when non-NULL pointers
1331
are supplied. Repeat counts must be less than 65536 (MAX_REPEAT_COUNT); a
1332
larger value is used for "unlimited". We have to use signed arguments for
1333
read_number() because it is capable of returning a signed value. As of Perl
1334
5.34.0 either n or m may be absent, but not both. Perl also allows spaces and
1335
tabs after { and before } and between the numbers and the comma, so we do too.
1336
1337
Arguments:
1338
  ptrptr         points to pointer to character after '{'
1339
  ptrend         pointer to end of input
1340
  minp           if not NULL, pointer to int for min
1341
  maxp           if not NULL, pointer to int for max
1342
  errorcodeptr   points to error code variable
1343
1344
Returns:         FALSE if not a repeat quantifier, errorcode set zero
1345
                 FALSE on error, with errorcode set non-zero
1346
                 TRUE on success, with pointer updated to point after '}'
1347
*/
1348
1349
static BOOL
1350
read_repeat_counts(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *minp,
1351
  uint32_t *maxp, int *errorcodeptr)
1352
27.5k
{
1353
27.5k
PCRE2_SPTR p = *ptrptr;
1354
27.5k
PCRE2_SPTR pp;
1355
27.5k
BOOL yield = FALSE;
1356
27.5k
BOOL had_minimum = FALSE;
1357
27.5k
int32_t min = 0;
1358
27.5k
int32_t max = REPEAT_UNLIMITED; /* This value is larger than MAX_REPEAT_COUNT */
1359
1360
27.5k
*errorcodeptr = 0;
1361
27.5k
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1362
1363
/* Check the syntax before interpreting. Otherwise, a non-quantifier sequence
1364
such as "X{123456ABC" would incorrectly give a "number too big in quantifier"
1365
error. */
1366
1367
27.5k
pp = p;
1368
27.5k
if (pp < ptrend && IS_DIGIT(*pp))
1369
20.2k
  {
1370
20.2k
  had_minimum = TRUE;
1371
31.4k
  while (++pp < ptrend && IS_DIGIT(*pp)) {}
1372
20.2k
  }
1373
1374
27.6k
while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1375
27.5k
if (pp >= ptrend) return FALSE;
1376
1377
27.4k
if (*pp == CHAR_RIGHT_CURLY_BRACKET)
1378
12.1k
  {
1379
12.1k
  if (!had_minimum) return FALSE;
1380
12.1k
  }
1381
15.2k
else
1382
15.2k
  {
1383
15.2k
  if (*pp++ != CHAR_COMMA) return FALSE;
1384
6.94k
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1385
6.94k
  if (pp >= ptrend) return FALSE;
1386
6.92k
  if (IS_DIGIT(*pp))
1387
4.55k
    {
1388
6.93k
    while (++pp < ptrend && IS_DIGIT(*pp)) {}
1389
4.55k
    }
1390
2.36k
  else if (!had_minimum) return FALSE;
1391
6.37k
  while (pp < ptrend && (*pp == CHAR_SPACE || *pp == CHAR_HT)) pp++;
1392
6.37k
  if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
1393
6.37k
  }
1394
1395
/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}
1396
or {n,m}. The only error that read_number() can return is for a number that is
1397
too big. If *errorcodeptr is returned as zero it means no number was found. */
1398
1399
/* Deal with {,m} or n too big. If we successfully read m there is no need to
1400
check m >= n because n defaults to zero. */
1401
1402
17.4k
if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &min, errorcodeptr))
1403
14
  {
1404
14
  if (*errorcodeptr != 0) goto EXIT;    /* n too big */
1405
12
  p++;  /* Skip comma and subsequent spaces */
1406
12
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1407
12
  if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1408
0
    {
1409
0
    if (*errorcodeptr != 0) goto EXIT;  /* m too big */
1410
0
    }
1411
12
  }
1412
1413
/* Have read one number. Deal with {n} or {n,} or {n,m} */
1414
1415
17.4k
else
1416
17.4k
  {
1417
17.4k
  while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1418
17.4k
  if (*p == CHAR_RIGHT_CURLY_BRACKET)
1419
12.0k
    {
1420
12.0k
    max = min;
1421
12.0k
    }
1422
5.39k
  else   /* Handle {n,} or {n,m} */
1423
5.39k
    {
1424
5.39k
    p++;    /* Skip comma and subsequent spaces */
1425
5.39k
    while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1426
5.39k
    if (!read_number(&p, ptrend, -1, MAX_REPEAT_COUNT, ERR5, &max, errorcodeptr))
1427
1.21k
      {
1428
1.21k
      if (*errorcodeptr != 0) goto EXIT;   /* m too big */
1429
1.21k
      }
1430
1431
5.38k
    if (max < min)
1432
2
      {
1433
2
      *errorcodeptr = ERR4;
1434
2
      goto EXIT;
1435
2
      }
1436
5.38k
    }
1437
17.4k
  }
1438
1439
/* Valid quantifier exists */
1440
1441
17.4k
while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1442
17.4k
p++;
1443
17.4k
yield = TRUE;
1444
17.4k
if (minp != NULL) *minp = (uint32_t)min;
1445
17.4k
if (maxp != NULL) *maxp = (uint32_t)max;
1446
1447
/* Update the pattern pointer */
1448
1449
17.4k
EXIT:
1450
17.4k
*ptrptr = p;
1451
17.4k
return yield;
1452
17.4k
}
1453
1454
1455
1456
/*************************************************
1457
*            Handle escapes                      *
1458
*************************************************/
1459
1460
/* This function is called when a \ has been encountered. It either returns a
1461
positive value for a simple escape such as \d, or 0 for a data character, which
1462
is placed in chptr. A backreference to group n is returned as -(n+1). On
1463
entry, ptr is pointing at the character after \. On exit, it points after the
1464
final code unit of the escape sequence.
1465
1466
This function is also called from pcre2_substitute() to handle escape sequences
1467
in replacement strings. In this case, the cb argument is NULL, and in the case
1468
of escapes that have further processing, only sequences that define a data
1469
character are recognised. The options argument is the final value of the
1470
compiled pattern's options.
1471
1472
Arguments:
1473
  ptrptr         points to the input position pointer
1474
  ptrend         points to the end of the input
1475
  chptr          points to a returned data character
1476
  errorcodeptr   points to the errorcode variable (containing zero)
1477
  options        the current options bits
1478
  xoptions       the current extra options bits
1479
  bracount       the number of capturing parentheses encountered so far
1480
  isclass        TRUE if in a character class
1481
  cb             compile data block or NULL when called from pcre2_substitute()
1482
1483
Returns:         zero => a data character
1484
                 positive => a special escape sequence
1485
                 negative => a numerical back reference
1486
                 on error, errorcodeptr is set non-zero
1487
*/
1488
1489
int
1490
PRIV(check_escape)(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, uint32_t *chptr,
1491
  int *errorcodeptr, uint32_t options, uint32_t xoptions, uint32_t bracount,
1492
  BOOL isclass, compile_block *cb)
1493
24.6k
{
1494
24.6k
BOOL utf = (options & PCRE2_UTF) != 0;
1495
24.6k
BOOL alt_bsux =
1496
24.6k
  ((options & PCRE2_ALT_BSUX) | (xoptions & PCRE2_EXTRA_ALT_BSUX)) != 0;
1497
24.6k
PCRE2_SPTR ptr = *ptrptr;
1498
24.6k
uint32_t c, cc;
1499
24.6k
int escape = 0;
1500
24.6k
int i;
1501
1502
/* If backslash is at the end of the string, it's an error. */
1503
1504
24.6k
if (ptr >= ptrend)
1505
33
  {
1506
33
  *errorcodeptr = ERR1;
1507
33
  return 0;
1508
33
  }
1509
1510
24.6k
GETCHARINCTEST(c, ptr);         /* Get character value, increment pointer */
1511
24.6k
*errorcodeptr = 0;              /* Be optimistic */
1512
1513
/* Non-alphanumerics are literals, so we just leave the value in c. An initial
1514
value test saves a memory lookup for code points outside the alphanumeric
1515
range. */
1516
1517
24.6k
if (c < ESCAPES_FIRST || c > ESCAPES_LAST) {}  /* Definitely literal */
1518
1519
/* Otherwise, do a table lookup. Non-zero values need little processing here. A
1520
positive value is a literal value for something like \n. A negative value is
1521
the negation of one of the ESC_ macros that is passed back for handling by the
1522
calling function. Some extra checking is needed for \N because only \N{U+dddd}
1523
is supported. If the value is zero, further processing is handled below. */
1524
1525
23.3k
else if ((i = escapes[c - ESCAPES_FIRST]) != 0)
1526
11.5k
  {
1527
11.5k
  if (i > 0)
1528
792
    {
1529
792
    c = (uint32_t)i;
1530
792
    if (c == CHAR_CR && (xoptions & PCRE2_EXTRA_ESCAPED_CR_IS_LF) != 0)
1531
0
      c = CHAR_LF;
1532
792
    }
1533
10.7k
  else  /* Negative table entry */
1534
10.7k
    {
1535
10.7k
    escape = -i;                    /* Else return a special escape */
1536
10.7k
    if (cb != NULL && (escape == ESC_P || escape == ESC_p || escape == ESC_X))
1537
8
      cb->external_flags |= PCRE2_HASBKPORX;   /* Note \P, \p, or \X */
1538
1539
    /* Perl supports \N{name} for character names and \N{U+dddd} for numerical
1540
    Unicode code points, as well as plain \N for "not newline". PCRE does not
1541
    support \N{name}. However, it does support quantification such as \N{2,3},
1542
    so if \N{ is not followed by U+dddd we check for a quantifier. */
1543
1544
10.7k
    if (escape == ESC_N && ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
1545
202
      {
1546
202
      PCRE2_SPTR p = ptr + 1;
1547
1548
      /* Perl ignores spaces and tabs after { */
1549
1550
202
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1551
1552
      /* \N{U+ can be handled by the \x{ code. However, this construction is
1553
      not valid in EBCDIC environments because it specifies a Unicode
1554
      character, not a codepoint in the local code. For example \N{U+0041}
1555
      must be "A" in all environments. Also, in Perl, \N{U+ forces Unicode
1556
      casing semantics for the entire pattern, so allow it only in UTF (i.e.
1557
      Unicode) mode. */
1558
1559
202
      if (ptrend - p > 1 && *p == CHAR_U && p[1] == CHAR_PLUS)
1560
0
        {
1561
0
#ifndef EBCDIC
1562
0
        if (utf)
1563
0
          {
1564
0
          ptr = p + 2;
1565
0
          escape = 0;   /* Not a fancy escape after all */
1566
0
          goto COME_FROM_NU;
1567
0
          }
1568
0
#endif
1569
1570
        /* Improve error offset. */
1571
0
        ptr = p + 2;
1572
0
        while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
1573
0
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1574
0
        if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET) ptr++;
1575
1576
0
        *errorcodeptr = ERR93;
1577
0
        }
1578
1579
      /* Give an error in contexts where quantifiers are not allowed
1580
      (character classes; substitution strings). */
1581
1582
202
      else if (isclass || cb == NULL)
1583
0
        {
1584
0
        ptr++; /* Skip over the opening brace */
1585
0
        *errorcodeptr = ERR37;
1586
0
        }
1587
1588
      /* Give an error if what follows is not a quantifier, but don't override
1589
      an error set by the quantifier reader (e.g. number overflow). */
1590
1591
202
      else
1592
202
        {
1593
202
        if (!read_repeat_counts(&p, ptrend, NULL, NULL, errorcodeptr) &&
1594
1
             *errorcodeptr == 0)
1595
1
          {
1596
1
          ptr++; /* Skip over the opening brace */
1597
1
          *errorcodeptr = ERR37;
1598
1
          }
1599
202
        }
1600
202
      }
1601
10.7k
    }
1602
11.5k
  }
1603
1604
/* Escapes that need further processing, including those that are unknown, have
1605
a zero entry in the lookup table. When called from pcre2_substitute(), only \c,
1606
\g, \o, \x, and digit escapes are recognized (\u and \U can never appear as they are used for case
1607
forcing). */
1608
1609
11.8k
else
1610
11.8k
  {
1611
11.8k
  int s;
1612
11.8k
  PCRE2_SPTR oldptr;
1613
11.8k
  BOOL overflow;
1614
1615
  /* Filter calls from pcre2_substitute(). */
1616
1617
11.8k
  if (cb == NULL)
1618
0
    {
1619
0
    if (!(c >= CHAR_0 && c <= CHAR_9) && c != CHAR_c && c != CHAR_o &&
1620
0
        c != CHAR_x && c != CHAR_g)
1621
0
      {
1622
0
      *errorcodeptr = ERR3;
1623
0
      goto EXIT;
1624
0
      }
1625
0
    alt_bsux = FALSE;   /* Do not modify \x handling */
1626
0
    }
1627
1628
11.8k
  switch (c)
1629
11.8k
    {
1630
    /* A number of Perl escapes are not handled by PCRE. We give an explicit
1631
    error. */
1632
1633
2
    case CHAR_F:
1634
3
    case CHAR_l:
1635
5
    case CHAR_L:
1636
5
    *errorcodeptr = ERR37;
1637
5
    break;
1638
1639
    /* \u is unrecognized when neither PCRE2_ALT_BSUX nor PCRE2_EXTRA_ALT_BSUX
1640
    is set. Otherwise, \u must be followed by exactly four hex digits or, if
1641
    PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
1642
    Otherwise it is a lowercase u letter. This gives some compatibility with
1643
    ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
1644
    allowed. When \u{ is not followed by hex digits, a special return is given
1645
    because otherwise \u{ 12} (for example) would be treated as u{12}. */
1646
1647
1
    case CHAR_u:
1648
1
    if (!alt_bsux)
1649
1
      *errorcodeptr = ERR37;
1650
0
    else
1651
0
      {
1652
0
      uint32_t xc;
1653
1654
0
      if (ptr >= ptrend) break;
1655
0
      if (*ptr == CHAR_LEFT_CURLY_BRACKET &&
1656
0
          (xoptions & PCRE2_EXTRA_ALT_BSUX) != 0)
1657
0
        {
1658
0
        PCRE2_SPTR hptr = ptr + 1;
1659
1660
0
        cc = 0;
1661
0
        while (hptr < ptrend && (xc = XDIGIT(*hptr)) != 0xff)
1662
0
          {
1663
0
          if ((cc & 0xf0000000) != 0)  /* Test for 32-bit overflow */
1664
0
            {
1665
0
            *errorcodeptr = ERR77;
1666
0
            ptr = hptr;   /* Show where */
1667
0
            break;        /* *hptr != } will cause another break below */
1668
0
            }
1669
0
          cc = (cc << 4) | xc;
1670
0
          hptr++;
1671
0
          }
1672
1673
0
        if (hptr == ptr + 1 ||   /* No hex digits */
1674
0
            hptr >= ptrend ||    /* Hit end of input */
1675
0
            *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
1676
0
          {
1677
0
          if (isclass) break; /* In a class, just treat as '\u' literal */
1678
0
          escape = ESC_ub;    /* Special return */
1679
0
          ptr++;              /* Skip { */
1680
0
          break;              /* Hex escape not recognized */
1681
0
          }
1682
1683
0
        c = cc;          /* Accept the code point */
1684
0
        ptr = hptr + 1;
1685
0
        }
1686
1687
0
      else  /* Must be exactly 4 hex digits */
1688
0
        {
1689
0
        if (ptrend - ptr < 4) break;               /* Less than 4 chars */
1690
0
        if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
1691
0
        if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
1692
0
        cc = (cc << 4) | xc;
1693
0
        if ((xc = XDIGIT(ptr[2])) == 0xff) break;  /* Not a hex digit */
1694
0
        cc = (cc << 4) | xc;
1695
0
        if ((xc = XDIGIT(ptr[3])) == 0xff) break;  /* Not a hex digit */
1696
0
        c = (cc << 4) | xc;
1697
0
        ptr += 4;
1698
0
        }
1699
1700
0
      if (utf)
1701
0
        {
1702
0
        if (c > 0x10ffffU) *errorcodeptr = ERR77;
1703
0
        else
1704
0
          if (c >= 0xd800 && c <= 0xdfff &&
1705
0
              (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
1706
0
                *errorcodeptr = ERR73;
1707
0
        }
1708
0
      else if (c > MAX_NON_UTF_CHAR) *errorcodeptr = ERR77;
1709
0
      }
1710
1
    break;
1711
1712
    /* \U is unrecognized unless PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set,
1713
    in which case it is an upper case letter. */
1714
1715
1
    case CHAR_U:
1716
1
    if (!alt_bsux) *errorcodeptr = ERR37;
1717
1
    break;
1718
1719
    /* In a character class, \g is just a literal "g". Outside a character
1720
    class, \g must be followed by one of a number of specific things:
1721
1722
    (1) A number, either plain or braced. If positive, it is an absolute
1723
    backreference. If negative, it is a relative backreference. This is a Perl
1724
    5.10 feature.
1725
1726
    (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
1727
    is part of Perl's movement towards a unified syntax for back references. As
1728
    this is synonymous with \k{name}, we fudge it up by pretending it really
1729
    was \k{name}.
1730
1731
    (3) For Oniguruma compatibility we also support \g followed by a name or a
1732
    number either in angle brackets or in single quotes. However, these are
1733
    (possibly recursive) subroutine calls, _not_ backreferences. We return
1734
    the ESC_g code.
1735
1736
    Summary: Return a negative number for a numerical back reference (offset
1737
    by 1), ESC_k for a named back reference, and ESC_g for a named or
1738
    numbered subroutine call.
1739
1740
    The above describes the \g behaviour inside patterns. Inside replacement
1741
    strings (pcre2_substitute) we support only \g<nameornum> for Python
1742
    compatibility. Return ESC_g for the named case, and -(num+1) for the
1743
    numbered case.
1744
    */
1745
1746
6.48k
    case CHAR_g:
1747
6.48k
    if (isclass) break;
1748
1749
6.37k
    if (ptr >= ptrend)
1750
3
      {
1751
3
      *errorcodeptr = ERR57;
1752
3
      break;
1753
3
      }
1754
1755
6.37k
    if (cb == NULL)
1756
0
      {
1757
0
      PCRE2_SPTR p;
1758
      /* Substitution strings */
1759
0
      if (*ptr != CHAR_LESS_THAN_SIGN)
1760
0
        {
1761
0
        *errorcodeptr = ERR57;
1762
0
        break;
1763
0
        }
1764
1765
0
      p = ptr + 1;
1766
1767
0
      if (!read_number(&p, ptrend, -1, MAX_GROUP_NUMBER, ERR61, &s,
1768
0
          errorcodeptr))
1769
0
        {
1770
0
        if (*errorcodeptr == 0) escape = ESC_g;  /* No number found */
1771
0
        break;
1772
0
        }
1773
1774
0
      if (p >= ptrend || *p != CHAR_GREATER_THAN_SIGN)
1775
0
        {
1776
0
        ptr = p;
1777
0
        *errorcodeptr = ERR119;  /* Missing terminator for number */
1778
0
        break;
1779
0
        }
1780
1781
      /* This is the reason that back references are returned as -(s+1) rather
1782
      than just -s. In a pattern, \0 is not a back reference, but \g<0> is
1783
      valid in a substitution string, so this must be representable. */
1784
0
      ptr = p + 1;
1785
0
      escape = -(s+1);
1786
0
      break;
1787
0
      }
1788
1789
6.37k
    if (*ptr == CHAR_LESS_THAN_SIGN || *ptr == CHAR_APOSTROPHE)
1790
5.10k
      {
1791
5.10k
      escape = ESC_g;
1792
5.10k
      break;
1793
5.10k
      }
1794
1795
    /* If there is a brace delimiter, try to read a numerical reference. If
1796
    there isn't one, assume we have a name and treat it as \k. */
1797
1798
1.26k
    if (*ptr == CHAR_LEFT_CURLY_BRACKET)
1799
430
      {
1800
430
      PCRE2_SPTR p = ptr + 1;
1801
1802
431
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1803
430
      if (!read_number(&p, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1804
430
          errorcodeptr))
1805
319
        {
1806
319
        if (*errorcodeptr == 0) escape = ESC_k;  /* No number found */
1807
319
        break;
1808
319
        }
1809
111
      while (p < ptrend && (*p == CHAR_SPACE || *p == CHAR_HT)) p++;
1810
1811
111
      if (p >= ptrend || *p != CHAR_RIGHT_CURLY_BRACKET)
1812
5
        {
1813
5
        ptr = p;
1814
5
        *errorcodeptr = ERR119;  /* Missing terminator for number */
1815
5
        break;
1816
5
        }
1817
106
      ptr = p + 1;
1818
106
      }
1819
1820
    /* Read an undelimited number */
1821
1822
839
    else
1823
839
      {
1824
839
      if (!read_number(&ptr, ptrend, bracount, MAX_GROUP_NUMBER, ERR61, &s,
1825
839
          errorcodeptr))
1826
159
        {
1827
159
        if (*errorcodeptr == 0) *errorcodeptr = ERR57;  /* No number found */
1828
159
        break;
1829
159
        }
1830
839
      }
1831
1832
786
    if (s <= 0)
1833
2
      {
1834
2
      *errorcodeptr = ERR15;
1835
2
      break;
1836
2
      }
1837
1838
784
    escape = -(s+1);
1839
784
    break;
1840
1841
    /* The handling of escape sequences consisting of a string of digits
1842
    starting with one that is not zero is not straightforward. Perl has changed
1843
    over the years. Nowadays \g{} for backreferences and \o{} for octal are
1844
    recommended to avoid the ambiguities in the old syntax.
1845
1846
    Outside a character class, the digits are read as a decimal number. If the
1847
    number is less than 10, or if there are that many previous extracting left
1848
    brackets, it is a back reference. Otherwise, up to three octal digits are
1849
    read to form an escaped character code. Thus \123 is likely to be octal 123
1850
    (cf \0123, which is octal 012 followed by the literal 3). This is the "Perl
1851
    style" of handling ambiguous octal/backreferences such as \12.
1852
1853
    There is an alternative disambiguation strategy, selected by
1854
    PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must
1855
    have either a leading zero, or exactly three octal digits; otherwise it's
1856
    a backreference. The disambiguation is stable, and does not depend on how
1857
    many capture groups are defined (it's simply an invalid backreference if
1858
    there is no corresponding capture group). Additionally, octal values above
1859
    \377 (\xff) are rejected.
1860
1861
    Inside a character class, \ followed by a digit is always either a literal
1862
    8 or 9 or an octal number. */
1863
1864
2.65k
    case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
1865
3.59k
    case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
1866
1867
3.59k
    if (isclass)
1868
197
      {
1869
      /* Fall through to octal handling; never a backreference inside a class. */
1870
197
      }
1871
3.39k
    else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)
1872
0
      {
1873
      /* Python-style disambiguation. */
1874
0
      if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&
1875
0
          ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1876
0
        {
1877
        /* We peeked a three-digit octal, so fall through */
1878
0
        }
1879
0
      else
1880
0
        {
1881
        /* We are at a digit, so the only possible error from read_number() is
1882
        a number that is too large. */
1883
0
        ptr--;   /* Back to the digit */
1884
1885
0
        if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1886
0
          {
1887
0
          *errorcodeptr = ERR61;
1888
0
          break;
1889
0
          }
1890
1891
0
        escape = -(s+1);
1892
0
        break;
1893
0
        }
1894
0
      }
1895
3.39k
    else
1896
3.39k
      {
1897
      /* Perl-style disambiguation. */
1898
3.39k
      oldptr = ptr;
1899
3.39k
      ptr--;   /* Back to the digit */
1900
1901
      /* As we know we are at a digit, the only possible error from
1902
      read_number() is a number that is too large to be a group number. Because
1903
      that number might be still valid if read as an octal, errorcodeptr is not
1904
      set on failure and therefore a sentinel value of INT_MAX is used instead
1905
      of the original value, and will be used later to properly set the error,
1906
      if not falling through. */
1907
1908
3.39k
      if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
1909
63
        s = INT_MAX;
1910
1911
      /* \1 to \9 are always back references. \8x and \9x are too; \1x to \7x
1912
      are octal escapes if there are not that many previous captures. */
1913
1914
3.39k
      if (s < 10 || c >= CHAR_8 || (unsigned)s <= bracount)
1915
3.06k
        {
1916
        /* s > MAX_GROUP_NUMBER should not be possible because of read_number(),
1917
        but we keep it just to be safe and because it will also catch the
1918
        sentinel value that was set on failure by that function. */
1919
1920
3.06k
        if ((unsigned)s > MAX_GROUP_NUMBER)
1921
12
          {
1922
12
          PCRE2_ASSERT(s == INT_MAX);
1923
12
          *errorcodeptr = ERR61;
1924
12
          }
1925
3.05k
        else escape = -(s+1);     /* Indicates a back reference */
1926
3.06k
        break;
1927
3.06k
        }
1928
1929
333
      ptr = oldptr;      /* Put the pointer back and fall through */
1930
333
      }
1931
1932
    /* Handle a digit following \ when the number is not a back reference, or
1933
    we are within a character class. If the first digit is 8 or 9, Perl used to
1934
    generate a binary zero and then treat the digit as a following literal. At
1935
    least by Perl 5.18 this changed so as not to insert the binary zero. */
1936
1937
530
    if (c >= CHAR_8) break;
1938
1939
504
    PCRE2_FALLTHROUGH /* Fall through */
1940
504
1941
504
    /* \0 always starts an octal number, but we may drop through to here with a
1942
504
    larger first octal digit. The original code used just to take the least
1943
504
    significant 8 bits of octal numbers (I think this is what early Perls used
1944
504
    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,
1945
504
    but no more than 3 octal digits. */
1946
504
1947
702
    case CHAR_0:
1948
702
    c -= CHAR_0;
1949
1.33k
    while(i++ < 2 && ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1950
630
        c = c * 8 + *ptr++ - CHAR_0;
1951
702
    if (c > 0xff)
1952
21
      {
1953
21
      if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) *errorcodeptr = ERR102;
1954
21
#if PCRE2_CODE_UNIT_WIDTH == 8
1955
21
      else if (!utf) *errorcodeptr = ERR51;
1956
21
#endif
1957
21
      }
1958
1959
    /* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect
1960
    two- or three-character octal escapes \00 and \000, nor \x00. */
1961
1962
702
    if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)
1963
0
        *errorcodeptr = ERR98;
1964
702
    break;
1965
1966
    /* \o is a relatively new Perl feature, supporting a more general way of
1967
    specifying character codes in octal. The only supported form is \o{ddd},
1968
    with optional spaces or tabs after { and before }. */
1969
1970
256
    case CHAR_o:
1971
256
    if (ptr >= ptrend || *ptr != CHAR_LEFT_CURLY_BRACKET)
1972
9
      {
1973
9
      *errorcodeptr = ERR55;
1974
9
      break;
1975
9
      }
1976
247
    ptr++;
1977
1978
248
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
1979
247
    if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
1980
3
      {
1981
3
      *errorcodeptr = ERR78;
1982
3
      break;
1983
3
      }
1984
1985
244
    c = 0;
1986
244
    overflow = FALSE;
1987
816
    while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7)
1988
588
      {
1989
588
      cc = *ptr++;
1990
588
      if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1991
#if PCRE2_CODE_UNIT_WIDTH == 32
1992
      if (c >= 0x20000000u) { overflow = TRUE; break; }
1993
#endif
1994
406
      c = (c << 3) + (cc - CHAR_0);
1995
406
#if PCRE2_CODE_UNIT_WIDTH == 8
1996
406
      if (c > (utf ? 0x10ffffU : 0xffU)) { overflow = TRUE; break; }
1997
#elif PCRE2_CODE_UNIT_WIDTH == 16
1998
      if (c > (utf ? 0x10ffffU : 0xffffU)) { overflow = TRUE; break; }
1999
#elif PCRE2_CODE_UNIT_WIDTH == 32
2000
      if (utf && c > 0x10ffffU) { overflow = TRUE; break; }
2001
#endif
2002
406
      }
2003
2004
245
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2005
2006
244
    if (overflow)
2007
16
      {
2008
212
      while (ptr < ptrend && *ptr >= CHAR_0 && *ptr <= CHAR_7) ptr++;
2009
16
      *errorcodeptr = ERR34;
2010
16
      }
2011
228
    else if (utf && c >= 0xd800 && c <= 0xdfff &&
2012
0
             (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2013
0
      {
2014
0
      *errorcodeptr = ERR73;
2015
0
      }
2016
228
    else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)
2017
208
      {
2018
208
      ptr++;
2019
208
      }
2020
20
    else
2021
20
      {
2022
20
      *errorcodeptr = ERR64;
2023
20
      goto ESCAPE_FAILED_FORWARD;
2024
20
      }
2025
224
    break;
2026
2027
    /* When PCRE2_ALT_BSUX or PCRE2_EXTRA_ALT_BSUX is set, \x must be followed
2028
    by two hexadecimal digits. Otherwise it is a lowercase x letter. */
2029
2030
969
    case CHAR_x:
2031
969
    if (alt_bsux)
2032
0
      {
2033
0
      uint32_t xc;
2034
0
      if (ptrend - ptr < 2) break;               /* Less than 2 characters */
2035
0
      if ((cc = XDIGIT(ptr[0])) == 0xff) break;  /* Not a hex digit */
2036
0
      if ((xc = XDIGIT(ptr[1])) == 0xff) break;  /* Not a hex digit */
2037
0
      c = (cc << 4) | xc;
2038
0
      ptr += 2;
2039
0
      }
2040
2041
    /* Handle \x in Perl's style. \x{ddd} is a character code which can be
2042
    greater than 0xff in UTF-8 or non-8bit mode, but only if the ddd are hex
2043
    digits. If not, { used to be treated as a data character. However, Perl
2044
    seems to read hex digits up to the first non-such, and ignore the rest, so
2045
    that, for example \x{zz} matches a binary zero. This seems crazy, so PCRE
2046
    now gives an error. */
2047
2048
969
    else
2049
969
      {
2050
969
      if (ptr < ptrend && *ptr == CHAR_LEFT_CURLY_BRACKET)
2051
207
        {
2052
207
        ptr++;
2053
207
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2054
2055
207
#ifndef EBCDIC
2056
207
        COME_FROM_NU:
2057
207
#endif
2058
207
        if (ptr >= ptrend || *ptr == CHAR_RIGHT_CURLY_BRACKET)
2059
3
          {
2060
3
          *errorcodeptr = ERR78;
2061
3
          break;
2062
3
          }
2063
204
        c = 0;
2064
204
        overflow = FALSE;
2065
2066
645
        while (ptr < ptrend && (cc = XDIGIT(*ptr)) != 0xff)
2067
459
          {
2068
459
          ptr++;
2069
459
          if (c == 0 && cc == 0) continue;   /* Leading zeroes */
2070
#if PCRE2_CODE_UNIT_WIDTH == 32
2071
          if (c >= 0x10000000l) { overflow = TRUE; break; }
2072
#endif
2073
360
          c = (c << 4) | cc;
2074
360
          if ((utf && c > 0x10ffffU) || (!utf && c > MAX_NON_UTF_CHAR))
2075
18
            {
2076
18
            overflow = TRUE;
2077
18
            break;
2078
18
            }
2079
360
          }
2080
2081
        /* Perl ignores spaces and tabs before } */
2082
2083
204
        while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2084
2085
        /* On overflow, skip remaining hex digits */
2086
2087
204
        if (overflow)
2088
18
          {
2089
213
          while (ptr < ptrend && XDIGIT(*ptr) != 0xff) ptr++;
2090
18
          *errorcodeptr = ERR34;
2091
18
          }
2092
186
        else if (utf && c >= 0xd800 && c <= 0xdfff &&
2093
0
                 (xoptions & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) == 0)
2094
0
          {
2095
0
          *errorcodeptr = ERR73;
2096
0
          }
2097
186
        else if (ptr < ptrend && *ptr == CHAR_RIGHT_CURLY_BRACKET)
2098
167
          {
2099
167
          ptr++;
2100
167
          }
2101
2102
        /* If the sequence of hex digits (followed by optional space) does not
2103
        end with '}', give an error. We used just to recognize this construct
2104
        and fall through to the normal \x handling, but nowadays Perl gives an
2105
        error, which seems much more sensible, so we do too. */
2106
2107
19
        else
2108
19
          {
2109
19
          *errorcodeptr = ERR67;
2110
19
          goto ESCAPE_FAILED_FORWARD;
2111
19
          }
2112
204
        }   /* End of \x{} processing */
2113
2114
      /* Read up to two hex digits after \x */
2115
2116
762
      else
2117
762
        {
2118
        /* Perl has the surprising/broken behaviour that \x without following
2119
        hex digits is treated as an escape for NUL. Their source code laments
2120
        this but keeps it for backwards compatibility. A warning is printed
2121
        when "use warnings" is enabled. Because we don't have warnings, we
2122
        simply forbid it. */
2123
762
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff)
2124
20
          {
2125
          /* Not a hex digit */
2126
20
          *errorcodeptr = ERR78;
2127
20
          break;
2128
20
          }
2129
742
        ptr++;
2130
742
        c = cc;
2131
2132
        /* With "use re 'strict'" Perl actually requires exactly two digits (error
2133
        for \x, \xA and \xAAA). While \x was already rejected, this seems overly
2134
        strict, and there seems little incentive to align with that, given the
2135
        backwards-compatibility cost.
2136
2137
        For comparison, note that other engines disagree. For example:
2138
          - Java allows 1 or 2 hex digits. Error if 0 digits. No error if >2 digits
2139
          - .NET requires 2 hex digits. Error if 0, 1 digits. No error if >2 digits.
2140
        */
2141
742
        if (ptr >= ptrend || (cc = XDIGIT(*ptr)) == 0xff) break;  /* Not a hex digit */
2142
201
        ptr++;
2143
201
        c = (c << 4) | cc;
2144
201
        }     /* End of \xdd handling */
2145
969
      }       /* End of Perl-style \x handling */
2146
386
    break;
2147
2148
    /* The handling of \c is different in ASCII and EBCDIC environments. In an
2149
    ASCII (or Unicode) environment, an error is given if the character
2150
    following \c is not a printable ASCII character. Otherwise, the following
2151
    character is upper-cased if it is a letter, and after that the 0x40 bit is
2152
    flipped. The result is the value of the escape.
2153
2154
    In an EBCDIC environment the handling of \c is compatible with the
2155
    specification in the perlebcdic document. The following character must be
2156
    a letter or one of a small number of special characters. These provide a
2157
    means of defining the character values 0-31.
2158
2159
    For testing the EBCDIC handling of \c in an ASCII environment, recognize
2160
    the EBCDIC value of 'c' explicitly. */
2161
2162
386
    case CHAR_c:
2163
336
    if (ptr >= ptrend)
2164
1
      {
2165
1
      *errorcodeptr = ERR2;
2166
1
      break;
2167
1
      }
2168
335
    c = *ptr;
2169
335
    if (c >= CHAR_a && c <= CHAR_z) c = UPPER_CASE(c);
2170
2171
    /* Handle \c in an ASCII/Unicode environment. */
2172
2173
335
#ifndef EBCDIC    /* ASCII/UTF-8 coding */
2174
335
    if (c < 32 || c > 126)  /* Excludes all non-printable ASCII */
2175
8
      {
2176
8
      *errorcodeptr = ERR68;
2177
8
      goto ESCAPE_FAILED_FORWARD;
2178
8
      }
2179
327
    c ^= 0x40;
2180
2181
    /* Handle \c in an EBCDIC environment. The special case \c? is converted to
2182
    255 (0xff) or 95 (0x5f) if other characters suggest we are using the
2183
    POSIX-BC encoding. (This is the way Perl indicates that it handles \c?.)
2184
    The other valid sequences correspond to a list of specific characters. */
2185
2186
#else
2187
    if (c == CHAR_QUESTION_MARK)
2188
      c = (CHAR_BACKSLASH == 188 && CHAR_GRAVE_ACCENT == 74)? 0x5f : 0xff;
2189
    else
2190
      {
2191
      for (i = 0; i < 32; i++)
2192
        {
2193
        if (c == ebcdic_escape_c[i]) break;
2194
        }
2195
      if (i < 32)
2196
        c = i;
2197
      else
2198
        {
2199
        *errorcodeptr = ERR68;
2200
        goto ESCAPE_FAILED_FORWARD;
2201
        }
2202
      }
2203
#endif  /* EBCDIC */
2204
2205
327
    ptr++;
2206
327
    break;
2207
2208
    /* Any other alphanumeric following \ is an error. Perl gives an error only
2209
    if in warning mode, but PCRE doesn't have a warning mode. */
2210
2211
39
    default:
2212
39
    *errorcodeptr = ERR3;
2213
39
    break;
2214
11.8k
    }
2215
11.8k
  }
2216
2217
/* Set the pointer to the next character before returning. */
2218
2219
24.6k
EXIT:
2220
24.6k
*ptrptr = ptr;
2221
24.6k
*chptr = c;
2222
24.6k
return escape;
2223
2224
/* Some errors need to indicate the next character. */
2225
2226
47
ESCAPE_FAILED_FORWARD:
2227
47
ptr++;
2228
47
#ifdef SUPPORT_UNICODE
2229
47
if (utf) FORWARDCHARTEST(ptr, ptrend);
2230
47
#endif
2231
47
goto EXIT;
2232
24.6k
}
2233
2234
2235
2236
#ifdef SUPPORT_UNICODE
2237
/*************************************************
2238
*               Handle \P and \p                 *
2239
*************************************************/
2240
2241
/* This function is called after \P or \p has been encountered, provided that
2242
PCRE2 is compiled with support for UTF and Unicode properties. On entry, the
2243
contents of ptrptr are pointing after the P or p. On exit, it is left pointing
2244
after the final code unit of the escape sequence.
2245
2246
Arguments:
2247
  ptrptr         the pattern position pointer
2248
  utf            true if the input is UTF-encoded
2249
  negptr         a boolean that is set TRUE for negation else FALSE
2250
  ptypeptr       an unsigned int that is set to the type value
2251
  pdataptr       an unsigned int that is set to the detailed property value
2252
  errorcodeptr   the error code variable
2253
  cb             the compile data
2254
2255
Returns:         TRUE if the type value was found, or FALSE for an invalid type
2256
*/
2257
2258
static BOOL
2259
get_ucp(PCRE2_SPTR *ptrptr, BOOL utf, BOOL *negptr, uint16_t *ptypeptr,
2260
  uint16_t *pdataptr, int *errorcodeptr, compile_block *cb)
2261
6
{
2262
6
uint32_t c;
2263
6
ptrdiff_t i;
2264
6
PCRE2_SIZE bot, top;
2265
6
PCRE2_SPTR ptr = *ptrptr;
2266
6
PCRE2_UCHAR name[50];
2267
6
PCRE2_UCHAR *vptr = NULL;
2268
6
uint16_t ptscript = PT_NOTSCRIPT;
2269
2270
#ifndef MAYBE_UTF_MULTI
2271
(void)utf;  /* Avoid compiler warning */
2272
#endif
2273
2274
6
if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2275
0
GETCHARINCTEST(c, ptr);
2276
0
*negptr = FALSE;
2277
2278
/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
2279
negation. We must be handling Unicode encoding here, though we may be compiling
2280
for UTF-8 input in an EBCDIC environment. (PCRE2 does not support both EBCDIC
2281
input and Unicode input in the same build.) In accordance with Unicode's "loose
2282
matching" rules, ASCII white space, hyphens, and underscores are ignored. We
2283
don't use isspace() or tolower() because (a) code points may be greater than
2284
255, and (b) they wouldn't work when compiling for Unicode in an EBCDIC
2285
environment. */
2286
2287
0
if (c == CHAR_LEFT_CURLY_BRACKET)
2288
0
  {
2289
0
  if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2290
2291
0
  for (i = 0; i < (int)(sizeof(name) / sizeof(PCRE2_UCHAR)) - 1; i++)
2292
0
    {
2293
0
    REDO:
2294
2295
0
    if (ptr >= cb->end_pattern) goto ERROR_RETURN;
2296
0
    GETCHARINCTEST(c, ptr);
2297
2298
    /* Skip ignorable Unicode characters. */
2299
2300
0
    if (c == CHAR_UNDERSCORE || c == CHAR_MINUS || c == CHAR_SPACE ||
2301
0
        (c >= CHAR_HT && c <= CHAR_CR))
2302
0
      {
2303
0
      goto REDO;
2304
0
      }
2305
2306
    /* The first significant character being circumflex negates the meaning of
2307
    the item. */
2308
2309
0
    if (i == 0 && !*negptr && c == CHAR_CIRCUMFLEX_ACCENT)
2310
0
      {
2311
0
      *negptr = TRUE;
2312
0
      goto REDO;
2313
0
      }
2314
2315
0
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
2316
2317
    /* Names consist of ASCII letters and digits, but equals and colon may also
2318
    occur as a name/value separator. We must also allow for \p{L&}. A simple
2319
    check for a value between '&' and 'z' suffices because anything else in a
2320
    name or value will cause an "unknown property" error anyway. */
2321
2322
0
    if (c < CHAR_AMPERSAND || c > CHAR_z) goto ERROR_RETURN;
2323
2324
    /* Lower case a capital letter or remember where the name/value separator
2325
    is. */
2326
2327
0
    if (c >= CHAR_A && c <= CHAR_Z) c |= 0x20;
2328
0
    else if ((c == CHAR_COLON || c == CHAR_EQUALS_SIGN) && vptr == NULL)
2329
0
      vptr = name + i;
2330
2331
0
    name[i] = c;
2332
0
    }
2333
2334
  /* Error if the loop didn't end with '}' - either we hit the end of the
2335
  pattern or the name was longer than any legal property name. */
2336
2337
0
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
2338
0
  name[i] = 0;
2339
0
  }
2340
2341
/* If { doesn't follow \p or \P there is just one following character, which
2342
must be an ASCII letter. */
2343
2344
0
else if (c >= CHAR_A && c <= CHAR_Z)
2345
0
  {
2346
0
  name[0] = c | 0x20;  /* Lower case */
2347
0
  name[1] = 0;
2348
0
  }
2349
0
else if (c >= CHAR_a && c <= CHAR_z)
2350
0
  {
2351
0
  name[0] = c;
2352
0
  name[1] = 0;
2353
0
  }
2354
0
else goto ERROR_RETURN;
2355
2356
0
*ptrptr = ptr;   /* Update pattern pointer */
2357
2358
/* If the property contains ':' or '=' we have class name and value separately
2359
specified. The following are supported:
2360
2361
  . Bidi_Class (synonym bc), for which the property names are "bidi<name>".
2362
  . Script (synonym sc) for which the property name is the script name
2363
  . Script_Extensions (synonym scx), ditto
2364
2365
As this is a small number, we currently just check the names directly. If this
2366
grows, a sorted table and a switch will be neater.
2367
2368
For both the script properties, set a PT_xxx value so that (1) they can be
2369
distinguished and (2) invalid script names that happen to be the name of
2370
another property can be diagnosed. */
2371
2372
0
if (vptr != NULL)
2373
0
  {
2374
0
  int offset = 0;
2375
0
  PCRE2_UCHAR sname[8];
2376
2377
0
  *vptr = 0;   /* Terminate property name */
2378
0
  if (PRIV(strcmp_c8)(name, STRING_bidiclass) == 0 ||
2379
0
      PRIV(strcmp_c8)(name, STRING_bc) == 0)
2380
0
    {
2381
0
    offset = 4;
2382
0
    sname[0] = CHAR_b;
2383
0
    sname[1] = CHAR_i;  /* There is no strcpy_c8 function */
2384
0
    sname[2] = CHAR_d;
2385
0
    sname[3] = CHAR_i;
2386
0
    }
2387
2388
0
  else if (PRIV(strcmp_c8)(name, STRING_script) == 0 ||
2389
0
           PRIV(strcmp_c8)(name, STRING_sc) == 0)
2390
0
    ptscript = PT_SC;
2391
2392
0
  else if (PRIV(strcmp_c8)(name, STRING_scriptextensions) == 0 ||
2393
0
           PRIV(strcmp_c8)(name, STRING_scx) == 0)
2394
0
    ptscript = PT_SCX;
2395
2396
0
  else
2397
0
    {
2398
0
    *errorcodeptr = ERR47;
2399
0
    return FALSE;
2400
0
    }
2401
2402
  /* Adjust the string in name[] as needed */
2403
2404
0
  memmove(name + offset, vptr + 1, (name + i - vptr)*sizeof(PCRE2_UCHAR));
2405
0
  if (offset != 0) memmove(name, sname, offset*sizeof(PCRE2_UCHAR));
2406
0
  }
2407
2408
/* Search for a recognized property using binary chop. */
2409
2410
0
bot = 0;
2411
0
top = PRIV(utt_size);
2412
2413
0
while (bot < top)
2414
0
  {
2415
0
  int r;
2416
0
  i = (bot + top) >> 1;
2417
0
  r = PRIV(strcmp_c8)(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
2418
2419
  /* When a matching property is found, some extra checking is needed when the
2420
  \p{xx:yy} syntax is used and xx is either sc or scx. */
2421
2422
0
  if (r == 0)
2423
0
    {
2424
0
    *pdataptr = PRIV(utt)[i].value;
2425
0
    if (vptr == NULL || ptscript == PT_NOTSCRIPT)
2426
0
      {
2427
0
      *ptypeptr = PRIV(utt)[i].type;
2428
0
      return TRUE;
2429
0
      }
2430
2431
0
    switch (PRIV(utt)[i].type)
2432
0
      {
2433
0
      case PT_SC:
2434
0
      *ptypeptr = PT_SC;
2435
0
      return TRUE;
2436
2437
0
      case PT_SCX:
2438
0
      *ptypeptr = ptscript;
2439
0
      return TRUE;
2440
0
      }
2441
2442
0
    break;  /* Non-script found */
2443
0
    }
2444
2445
0
  if (r > 0) bot = i + 1; else top = i;
2446
0
  }
2447
2448
0
*errorcodeptr = ERR47;   /* Unrecognized property */
2449
0
return FALSE;
2450
2451
6
ERROR_RETURN:            /* Malformed \P or \p */
2452
6
*errorcodeptr = ERR46;
2453
6
*ptrptr = ptr;
2454
6
return FALSE;
2455
0
}
2456
#endif
2457
2458
2459
2460
/*************************************************
2461
*           Check for POSIX class syntax         *
2462
*************************************************/
2463
2464
/* This function is called when the sequence "[:" or "[." or "[=" is
2465
encountered in a character class. It checks whether this is followed by a
2466
sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2467
reach an unescaped ']' without the special preceding character, return FALSE.
2468
2469
Originally, this function only recognized a sequence of letters between the
2470
terminators, but it seems that Perl recognizes any sequence of characters,
2471
though of course unknown POSIX names are subsequently rejected. Perl gives an
2472
"Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2473
didn't consider this to be a POSIX class. Likewise for [:1234:].
2474
2475
The problem in trying to be exactly like Perl is in the handling of escapes. We
2476
have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2477
class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2478
below handles the special cases \\ and \], but does not try to do any other
2479
escape processing. This makes it different from Perl for cases such as
2480
[:l\ower:] where Perl recognizes it as the POSIX class "lower" but PCRE does
2481
not recognize "l\ower". This is a lesser evil than not diagnosing bad classes
2482
when Perl does, I think.
2483
2484
A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2485
It seems that the appearance of a nested POSIX class supersedes an apparent
2486
external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2487
a digit. This is handled by returning FALSE if the start of a new group with
2488
the same terminator is encountered, since the next closing sequence must close
2489
the nested group, not the outer one.
2490
2491
In Perl, unescaped square brackets may also appear as part of class names. For
2492
example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2493
[:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2494
seem right at all. PCRE does not allow closing square brackets in POSIX class
2495
names.
2496
2497
Arguments:
2498
  ptr      pointer to the character after the initial [ (colon, dot, equals)
2499
  ptrend   pointer to the end of the pattern
2500
  endptr   where to return a pointer to the terminating ':', '.', or '='
2501
2502
Returns:   TRUE or FALSE
2503
*/
2504
2505
static BOOL
2506
check_posix_syntax(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, PCRE2_SPTR *endptr)
2507
6.00k
{
2508
6.00k
PCRE2_UCHAR terminator;  /* Don't combine these lines; the Solaris cc */
2509
6.00k
terminator = *ptr++;     /* compiler warns about "non-constant" initializer. */
2510
2511
1.88M
for (; ptrend - ptr >= 2; ptr++)
2512
1.88M
  {
2513
1.88M
  if (*ptr == CHAR_BACKSLASH &&
2514
60.7k
      (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET || ptr[1] == CHAR_BACKSLASH))
2515
892
    ptr++;
2516
2517
1.88M
  else if ((*ptr == CHAR_LEFT_SQUARE_BRACKET && ptr[1] == terminator) ||
2518
1.88M
            *ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2519
2520
1.88M
  else if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2521
4.41k
    {
2522
4.41k
    *endptr = ptr;
2523
4.41k
    return TRUE;
2524
4.41k
    }
2525
1.88M
  }
2526
2527
207
return FALSE;
2528
6.00k
}
2529
2530
2531
2532
/*************************************************
2533
*          Check POSIX class name                *
2534
*************************************************/
2535
2536
/* This function is called to check the name given in a POSIX-style class entry
2537
such as [:alnum:].
2538
2539
Arguments:
2540
  ptr        points to the first letter
2541
  len        the length of the name
2542
2543
Returns:     a value representing the name, or -1 if unknown
2544
*/
2545
2546
static int
2547
check_posix_name(PCRE2_SPTR ptr, int len)
2548
4.41k
{
2549
4.41k
const char *pn = posix_names;
2550
4.41k
int yield = 0;
2551
57.5k
while (posix_name_lengths[yield] != 0)
2552
57.4k
  {
2553
57.4k
  if (len == posix_name_lengths[yield] &&
2554
6.08k
    PRIV(strncmp_c8)(ptr, pn, (unsigned int)len) == 0) return yield;
2555
53.1k
  pn += posix_name_lengths[yield] + 1;
2556
53.1k
  yield++;
2557
53.1k
  }
2558
103
return -1;
2559
4.41k
}
2560
2561
2562
2563
/*************************************************
2564
*       Read a subpattern or VERB name           *
2565
*************************************************/
2566
2567
/* This function is called from parse_regex() below whenever it needs to read
2568
the name of a subpattern or a (*VERB) or an (*alpha_assertion). The initial
2569
pointer must be to the preceding character. If that character is '*' we are
2570
reading a verb or alpha assertion name. The pointer is updated to point after
2571
the name, for a VERB or alpha assertion name, or after the name's terminator
2572
for a subpattern name. Returning both the offset and the name pointer is
2573
redundant information, but some callers use one and some the other, so it is
2574
simplest just to return both. When the name is in braces, spaces and tabs are
2575
allowed (and ignored) at either end.
2576
2577
Arguments:
2578
  ptrptr      points to the character pointer variable
2579
  ptrend      points to the end of the input string
2580
  utf         true if the input is UTF-encoded
2581
  terminator  the terminator of a subpattern name must be this
2582
  offsetptr   where to put the offset from the start of the pattern
2583
  nameptr     where to put a pointer to the name in the input
2584
  namelenptr  where to put the length of the name
2585
  errcodeptr  where to put an error code
2586
  cb          pointer to the compile data block
2587
2588
Returns:    TRUE if a name was read
2589
            FALSE otherwise, with error code set
2590
*/
2591
2592
static BOOL
2593
read_name(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend, BOOL utf, uint32_t terminator,
2594
  PCRE2_SIZE *offsetptr, PCRE2_SPTR *nameptr, uint32_t *namelenptr,
2595
  int *errorcodeptr, compile_block *cb)
2596
19.7k
{
2597
19.7k
PCRE2_SPTR ptr = *ptrptr;
2598
19.7k
BOOL is_group = (*ptr++ != CHAR_ASTERISK);
2599
19.7k
BOOL is_braced = terminator == CHAR_RIGHT_CURLY_BRACKET;
2600
2601
19.7k
if (is_braced)
2602
320
  while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2603
2604
19.7k
if (ptr >= ptrend)                 /* No characters in name */
2605
18
  {
2606
18
  *errorcodeptr = is_group? ERR62: /* Subpattern name expected */
2607
18
                            ERR60; /* Verb not recognized or malformed */
2608
18
  goto FAILED;
2609
18
  }
2610
2611
19.7k
*nameptr = ptr;
2612
19.7k
*offsetptr = (PCRE2_SIZE)(ptr - cb->start_pattern);
2613
2614
/* If this logic were ever to change, the matching function in pcre2_substitute.c
2615
ought to be updated to match. */
2616
2617
/* In UTF mode, a group name may contain letters and decimal digits as defined
2618
by Unicode properties, and underscores, but must not start with a digit. */
2619
2620
19.7k
#ifdef SUPPORT_UNICODE
2621
19.7k
if (utf && is_group)
2622
0
  {
2623
0
  uint32_t c, type;
2624
0
  PCRE2_SPTR p = ptr;
2625
2626
0
  GETCHARINC(c, p);  /* Peek at next character */
2627
0
  type = UCD_CHARTYPE(c);
2628
2629
0
  if (type == ucp_Nd)
2630
0
    {
2631
0
    ptr = p;
2632
0
    *errorcodeptr = ERR44;
2633
0
    goto FAILED;
2634
0
    }
2635
2636
0
  for(;;)
2637
0
    {
2638
0
    if (type != ucp_Nd && PRIV(ucp_gentype)[type] != ucp_L &&
2639
0
        c != CHAR_UNDERSCORE) break;
2640
0
    ptr = p;  /* Accept character and peek again */
2641
0
    if (p >= ptrend) break;
2642
0
    GETCHARINC(c, p);
2643
0
    type = UCD_CHARTYPE(c);
2644
0
    }
2645
0
  }
2646
19.7k
else
2647
#else
2648
(void)utf;  /* Avoid compiler warning */
2649
#endif      /* SUPPORT_UNICODE */
2650
2651
/* Handle non-group names and group names in non-UTF modes. A group name must
2652
not start with a digit. If either of the others start with a digit it just
2653
won't be recognized. */
2654
2655
19.7k
  {
2656
19.7k
  if (is_group && IS_DIGIT(*ptr))
2657
5
    {
2658
5
    ++ptr;
2659
5
    *errorcodeptr = ERR44;
2660
5
    goto FAILED;
2661
5
    }
2662
2663
1.41M
  while (ptr < ptrend && MAX_255(*ptr) && (cb->ctypes[*ptr] & ctype_word) != 0)
2664
1.39M
    {
2665
1.39M
    ptr++;
2666
1.39M
    }
2667
19.7k
  }
2668
2669
/* Check name length */
2670
2671
19.7k
if (ptr - *nameptr > MAX_NAME_SIZE)
2672
41
  {
2673
41
  *errorcodeptr = ERR48;
2674
41
  goto FAILED;
2675
41
  }
2676
19.6k
*namelenptr = (uint32_t)(ptr - *nameptr);
2677
2678
/* Subpattern names must not be empty, and their terminator is checked here.
2679
(What follows a verb or alpha assertion name is checked separately.) */
2680
2681
19.6k
if (is_group)
2682
17.5k
  {
2683
17.5k
  if (ptr == *nameptr)
2684
243
    {
2685
243
    *errorcodeptr = ERR62;   /* Subpattern name expected */
2686
243
    goto FAILED;
2687
243
    }
2688
17.2k
  if (is_braced)
2689
293
    while (ptr < ptrend && (*ptr == CHAR_SPACE || *ptr == CHAR_HT)) ptr++;
2690
17.2k
  if (terminator != 0)
2691
16.7k
    {
2692
16.7k
    if (ptr >= ptrend || *ptr != (PCRE2_UCHAR)terminator)
2693
41
      {
2694
41
      *errorcodeptr = ERR42;
2695
41
      goto FAILED;
2696
41
      }
2697
16.6k
    ptr++;
2698
16.6k
    }
2699
17.2k
  }
2700
2701
19.3k
*ptrptr = ptr;
2702
19.3k
return TRUE;
2703
2704
348
FAILED:
2705
348
*ptrptr = ptr;
2706
348
return FALSE;
2707
19.6k
}
2708
2709
2710
2711
/**************************************************
2712
*        Parse capturing bracket argument list    *
2713
**************************************************/
2714
2715
/* Reads a list of capture references. The references
2716
can be numbers or names.
2717
2718
Arguments:
2719
  ptrptr           points to the character pointer variable
2720
  ptrend           points to the end of the input string
2721
  utf              true if the input is UTF-encoded
2722
  parsed_pattern   the parsed pattern pointer
2723
  offset           last known offset
2724
  errcodeptr       where to put an error code
2725
  cb               pointer to the compile data block
2726
2727
Returns: updated parsed_pattern pointer on success
2728
         NULL otherwise
2729
*/
2730
2731
static uint32_t *
2732
parse_capture_list(PCRE2_SPTR *ptrptr, PCRE2_SPTR ptrend,
2733
  BOOL utf, uint32_t *parsed_pattern, PCRE2_SIZE offset,
2734
  int *errorcodeptr, compile_block *cb)
2735
0
{
2736
0
PCRE2_SIZE next_offset;
2737
0
PCRE2_SPTR ptr = *ptrptr;
2738
0
PCRE2_SPTR name;
2739
0
PCRE2_UCHAR terminator;
2740
0
uint32_t meta, namelen;
2741
0
int i;
2742
2743
0
if (ptr >= ptrend || *ptr != CHAR_LEFT_PARENTHESIS)
2744
0
  {
2745
0
  *errorcodeptr = ERR118;
2746
0
  goto FAILED;
2747
0
  }
2748
2749
0
for (;;)
2750
0
  {
2751
0
  ptr++;
2752
0
  next_offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
2753
2754
0
  if (ptr >= ptrend)
2755
0
    {
2756
0
    *errorcodeptr = ERR117;
2757
0
    goto FAILED;
2758
0
    }
2759
2760
  /* Handle [+-]number cases */
2761
0
  if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
2762
0
      &i, errorcodeptr))
2763
0
    {
2764
0
    PCRE2_ASSERT(i >= 0);
2765
0
    if (i <= 0)
2766
0
      {
2767
0
      *errorcodeptr = ERR15;
2768
0
      goto FAILED;
2769
0
      }
2770
0
    meta = META_CAPTURE_NUMBER;
2771
0
    namelen = (uint32_t)i;
2772
0
    }
2773
0
  else if (*errorcodeptr != 0) goto FAILED; /* Number too big */
2774
0
  else
2775
0
    {
2776
    /* Handle 'name' or <name> cases. */
2777
0
    if (*ptr == CHAR_LESS_THAN_SIGN)
2778
0
      terminator = CHAR_GREATER_THAN_SIGN;
2779
0
    else if (*ptr == CHAR_APOSTROPHE)
2780
0
      terminator = CHAR_APOSTROPHE;
2781
0
    else
2782
0
      {
2783
0
      *errorcodeptr = ERR117;
2784
0
      goto FAILED;
2785
0
      }
2786
2787
0
    if (!read_name(&ptr, ptrend, utf, terminator, &next_offset,
2788
0
        &name, &namelen, errorcodeptr, cb)) goto FAILED;
2789
2790
0
    meta = META_CAPTURE_NAME;
2791
0
    }
2792
2793
0
  PCRE2_ASSERT(next_offset > 0);
2794
0
  if (offset == 0 || (next_offset - offset) >= 0x10000)
2795
0
    {
2796
0
    *parsed_pattern++ = META_OFFSET;
2797
0
    PUTOFFSET(next_offset, parsed_pattern);
2798
0
    offset = next_offset;
2799
0
    }
2800
2801
  /* The offset is encoded as a relative offset, because for some
2802
  inputs such as ",2" in (1,2,3), we only have space for two uint32_t
2803
  values, and an opcode and absolute offset may require three uint32_t
2804
  values. */
2805
0
  *parsed_pattern++ = meta | (uint32_t)(next_offset - offset);
2806
0
  *parsed_pattern++ = namelen;
2807
0
  offset = next_offset;
2808
2809
0
  if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
2810
2811
0
  if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
2812
2813
0
  if (*ptr != CHAR_COMMA)
2814
0
    {
2815
0
    *errorcodeptr = ERR24;
2816
0
    goto FAILED;
2817
0
    }
2818
0
  }
2819
2820
0
*ptrptr = ptr + 1;
2821
0
return parsed_pattern;
2822
2823
0
UNCLOSED_PARENTHESIS:
2824
0
*errorcodeptr = ERR14;
2825
2826
0
FAILED:
2827
0
*ptrptr = ptr;
2828
0
return NULL;
2829
0
}
2830
2831
2832
2833
/*************************************************
2834
*          Manage callouts at start of cycle     *
2835
*************************************************/
2836
2837
/* At the start of a new item in parse_regex() we are able to record the
2838
details of the previous item in a prior callout, and also to set up an
2839
automatic callout if enabled. Avoid having two adjacent automatic callouts,
2840
which would otherwise happen for items such as \Q that contribute nothing to
2841
the parsed pattern.
2842
2843
Arguments:
2844
  ptr              current pattern pointer
2845
  pcalloutptr      points to a pointer to previous callout, or NULL
2846
  auto_callout     TRUE if auto_callouts are enabled
2847
  parsed_pattern   the parsed pattern pointer
2848
  cb               compile block
2849
2850
Returns: possibly updated parsed_pattern pointer.
2851
*/
2852
2853
static uint32_t *
2854
manage_callouts(PCRE2_SPTR ptr, uint32_t **pcalloutptr, BOOL auto_callout,
2855
  uint32_t *parsed_pattern, compile_block *cb)
2856
1.19M
{
2857
1.19M
uint32_t *previous_callout = *pcalloutptr;
2858
2859
1.19M
if (previous_callout != NULL) previous_callout[2] = (uint32_t)(ptr -
2860
918
  cb->start_pattern - (PCRE2_SIZE)previous_callout[1]);
2861
2862
1.19M
if (!auto_callout) previous_callout = NULL; else
2863
0
  {
2864
0
  if (previous_callout == NULL ||
2865
0
      previous_callout != parsed_pattern - 4 ||
2866
0
      previous_callout[3] != 255)
2867
0
    {
2868
0
    previous_callout = parsed_pattern;  /* Set up new automatic callout */
2869
0
    parsed_pattern += 4;
2870
0
    previous_callout[0] = META_CALLOUT_NUMBER;
2871
0
    previous_callout[2] = 0;
2872
0
    previous_callout[3] = 255;
2873
0
    }
2874
0
  previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
2875
0
  }
2876
2877
1.19M
*pcalloutptr = previous_callout;
2878
1.19M
return parsed_pattern;
2879
1.19M
}
2880
2881
2882
2883
/*************************************************
2884
*          Handle \d, \D, \s, \S, \w, \W         *
2885
*************************************************/
2886
2887
/* This function is called from parse_regex() below, both for freestanding
2888
escapes, and those within classes, to handle those escapes that may change when
2889
Unicode property support is requested. Note that PCRE2_UCP will never be set
2890
without Unicode support because that is checked when pcre2_compile() is called.
2891
2892
Arguments:
2893
  escape          the ESC_... value
2894
  parsed_pattern  where to add the code
2895
  options         options bits
2896
  xoptions        extra options bits
2897
2898
Returns:          updated value of parsed_pattern
2899
*/
2900
static uint32_t *
2901
handle_escdsw(int escape, uint32_t *parsed_pattern, uint32_t options,
2902
  uint32_t xoptions)
2903
2.45k
{
2904
2.45k
uint32_t ascii_option = 0;
2905
2.45k
uint32_t prop = ESC_p;
2906
2907
2.45k
switch(escape)
2908
2.45k
  {
2909
853
  case ESC_D:
2910
853
  prop = ESC_P;
2911
853
  PCRE2_FALLTHROUGH /* Fall through */
2912
1.02k
  case ESC_d:
2913
1.02k
  ascii_option = PCRE2_EXTRA_ASCII_BSD;
2914
1.02k
  break;
2915
2916
202
  case ESC_S:
2917
202
  prop = ESC_P;
2918
202
  PCRE2_FALLTHROUGH /* Fall through */
2919
924
  case ESC_s:
2920
924
  ascii_option = PCRE2_EXTRA_ASCII_BSS;
2921
924
  break;
2922
2923
326
  case ESC_W:
2924
326
  prop = ESC_P;
2925
326
  PCRE2_FALLTHROUGH /* Fall through */
2926
504
  case ESC_w:
2927
504
  ascii_option = PCRE2_EXTRA_ASCII_BSW;
2928
504
  break;
2929
2.45k
  }
2930
2931
2.45k
if ((options & PCRE2_UCP) == 0 || (xoptions & ascii_option) != 0)
2932
2.45k
  {
2933
2.45k
  *parsed_pattern++ = META_ESCAPE + escape;
2934
2.45k
  }
2935
0
else
2936
0
  {
2937
0
  *parsed_pattern++ = META_ESCAPE + prop;
2938
0
  switch(escape)
2939
0
    {
2940
0
    case ESC_d:
2941
0
    case ESC_D:
2942
0
    *parsed_pattern++ = (PT_PC << 16) | ucp_Nd;
2943
0
    break;
2944
2945
0
    case ESC_s:
2946
0
    case ESC_S:
2947
0
    *parsed_pattern++ = PT_SPACE << 16;
2948
0
    break;
2949
2950
0
    case ESC_w:
2951
0
    case ESC_W:
2952
0
    *parsed_pattern++ = PT_WORD << 16;
2953
0
    break;
2954
0
    }
2955
0
  }
2956
2957
2.45k
return parsed_pattern;
2958
2.45k
}
2959
2960
2961
2962
/*************************************************
2963
* Maximum size of parsed_pattern for given input *
2964
*************************************************/
2965
2966
/* This function is called from parse_regex() below, to determine the amount
2967
of memory to allocate for parsed_pattern. It is also called to check whether
2968
the amount of data written respects the amount of memory allocated.
2969
2970
Arguments:
2971
  ptr             points to the start of the pattern
2972
  ptrend          points to the end of the pattern
2973
  utf             TRUE in UTF mode
2974
  options         the options bits
2975
2976
Returns:          the number of uint32_t units for parsed_pattern
2977
*/
2978
static ptrdiff_t
2979
max_parsed_pattern(PCRE2_SPTR ptr, PCRE2_SPTR ptrend, BOOL utf,
2980
  uint32_t options)
2981
11.1k
{
2982
11.1k
PCRE2_SIZE big32count = 0;
2983
11.1k
ptrdiff_t parsed_size_needed;
2984
2985
/* When PCRE2_AUTO_CALLOUT is not set, in all but one case the number of
2986
unsigned 32-bit ints written out to the parsed pattern is bounded by the length
2987
of the pattern. The exceptional case is when running in 32-bit, non-UTF mode,
2988
when literal characters greater than META_END (0x80000000) have to be coded as
2989
two units. In this case, therefore, we scan the pattern to check for such
2990
values. */
2991
2992
#if PCRE2_CODE_UNIT_WIDTH == 32
2993
if (!utf)
2994
  {
2995
  PCRE2_SPTR p;
2996
  for (p = ptr; p < ptrend; p++) if (*p >= META_END) big32count++;
2997
  }
2998
#else
2999
11.1k
(void)utf;  /* Avoid compiler warning */
3000
11.1k
#endif
3001
3002
11.1k
parsed_size_needed = (ptrend - ptr) + big32count;
3003
3004
/* When PCRE2_AUTO_CALLOUT is set we have to assume a numerical callout (4
3005
elements) for each character. This is overkill, but memory is plentiful these
3006
days. */
3007
3008
11.1k
if ((options & PCRE2_AUTO_CALLOUT) != 0)
3009
0
  parsed_size_needed += (ptrend - ptr) * 4;
3010
3011
11.1k
return parsed_size_needed;
3012
11.1k
}
3013
3014
3015
3016
/*************************************************
3017
*      Parse regex and identify named groups     *
3018
*************************************************/
3019
3020
/* This function is called first of all. It scans the pattern and does two
3021
things: (1) It identifies capturing groups and makes a table of named capturing
3022
groups so that information about them is fully available to both the compiling
3023
scans. (2) It writes a parsed version of the pattern with comments omitted and
3024
escapes processed into the parsed_pattern vector.
3025
3026
Arguments:
3027
  ptr             points to the start of the pattern
3028
  options         compiling dynamic options (may change during the scan)
3029
  has_lookbehind  points to a boolean, set TRUE if a lookbehind is found
3030
  cb              pointer to the compile data block
3031
3032
Returns:   zero on success or a non-zero error code, with the
3033
             error offset placed in the cb field
3034
*/
3035
3036
/* State saved for a nested group while parsing, together with the flag bits
that are stored in its flags field. */

typedef struct nest_save {
  uint16_t  nest_depth;     /* Nesting depth this entry belongs to */
  uint16_t  reset_group;    /* Group number recorded for a reset */
  uint16_t  max_group;      /* Highest group number recorded */
  uint16_t  flags;          /* Combination of NSF_xxx bits */
  uint32_t  options;        /* Saved options bits */
  uint32_t  xoptions;       /* Saved extra options bits */
} nest_save;

/* Bits for the flags field of nest_save. */

#define NSF_RESET          0x0001u
#define NSF_CONDASSERT     0x0002u
#define NSF_ATOMICSR       0x0004u
3050
3051
/* Options that can be changed from within the pattern have to be tracked
while parsing. Some of them (e.g. PCRE2_EXTENDED) are implemented entirely
during parsing, but every one must be tracked so that META_OPTIONS items set
the correct values for the main compiling phase. */

#define PARSE_TRACKED_OPTIONS ( \
  PCRE2_CASELESS | \
  PCRE2_DOTALL | \
  PCRE2_DUPNAMES | \
  PCRE2_EXTENDED | \
  PCRE2_EXTENDED_MORE | \
  PCRE2_MULTILINE | \
  PCRE2_NO_AUTO_CAPTURE | \
  PCRE2_UNGREEDY)

/* The same, for the extra options word. */

#define PARSE_TRACKED_EXTRA_OPTIONS ( \
  PCRE2_EXTRA_CASELESS_RESTRICT | \
  PCRE2_EXTRA_ASCII_BSD | \
  PCRE2_EXTRA_ASCII_BSS | \
  PCRE2_EXTRA_ASCII_BSW | \
  PCRE2_EXTRA_ASCII_DIGIT | \
  PCRE2_EXTRA_ASCII_POSIX)
3063
3064
/* States used while analyzing ranges in character classes. The numeric order
matters: the two OK values must be last. */

enum {
  /* State after '[' (initial), or '[a-z'; hyphen is literal */
  RANGE_NO = 0,
  /* State after '[1-'; last-emitted code is META_RANGE_XYZ */
  RANGE_STARTED = 1,
  /* State after '[\d'; '-]' is allowed but not '-1]' */
  RANGE_FORBID_NO = 2,
  /* State after '[\d-' */
  RANGE_FORBID_STARTED = 3,
  /* State after '[\1'; hyphen may be a range */
  RANGE_OK_ESCAPED = 4,
  /* State after '[1'; hyphen may be a range */
  RANGE_OK_LITERAL = 5
};
3075
3076
/* States used while analyzing operators and operands in extended character
classes. */

enum {
  /* At start of an expression; empty previous contents */
  CLASS_OP_EMPTY = 0,
  /* Have preceding operand; after "z" a "--" can follow */
  CLASS_OP_OPERAND = 1,
  /* Have preceding operator; after "--" operand must follow */
  CLASS_OP_OPERATOR = 2
};
3084
3085
/* States used for determining the parse mode in character classes. The
numeric order matters: the two PERL_EXT values must be last. */

enum {
  /* Ordinary PCRE2 '[...]' class. */
  CLASS_MODE_NORMAL = 0,
  /* UTS#18-style extended '[...]' class. */
  CLASS_MODE_ALT_EXT = 1,
  /* Perl extended '(?[...])' class. */
  CLASS_MODE_PERL_EXT = 2,
  /* Leaf within extended '(?[ [...] ])' class. */
  CLASS_MODE_PERL_EXT_LEAF = 3
};
3094
3095
/* Only in 32-bit mode can there be literals > META_END. A macro encapsulates
the storing of literal values in the main parsed pattern, where they can always
be quantified.

PARSED_LITERAL(c, p) stores the literal c at *p, advancing p, and sets the
local variable okquantifier to TRUE; an okquantifier variable must therefore
be in scope wherever the macro is used. In 32-bit mode a value >= META_END is
preceded by a META_BIGVALUE item.

Both forms are wrapped in do { } while (0) so that an invocation followed by
a semicolon is exactly one statement and is safe as the body of an unbraced
if/else or loop. Every invocation must be terminated with a semicolon. The
arguments are parenthesized, but in the 32-bit form c is still evaluated
twice, so it must not have side effects; p must be a modifiable pointer
lvalue. */

#if PCRE2_CODE_UNIT_WIDTH == 32
#define PARSED_LITERAL(c, p) \
  do \
    { \
    if ((c) >= META_END) *(p)++ = META_BIGVALUE; \
    *(p)++ = (c); \
    okquantifier = TRUE; \
    } \
  while (0)
#else
#define PARSED_LITERAL(c, p) \
  do \
    { \
    *(p)++ = (c); \
    okquantifier = TRUE; \
    } \
  while (0)
#endif
3109
3110
/* Here's the actual function. */
3111
3112
static int parse_regex(PCRE2_SPTR ptr, uint32_t options, uint32_t xoptions,
3113
  BOOL *has_lookbehind, compile_block *cb)
3114
11.1k
{
3115
11.1k
uint32_t c;
3116
11.1k
uint32_t delimiter;
3117
11.1k
uint32_t namelen;
3118
11.1k
uint32_t class_range_state;
3119
11.1k
uint32_t class_op_state;
3120
11.1k
uint32_t class_mode_state;
3121
11.1k
uint32_t *class_start;
3122
11.1k
uint32_t *verblengthptr = NULL;     /* Value avoids compiler warning */
3123
11.1k
uint32_t *verbstartptr = NULL;
3124
11.1k
uint32_t *previous_callout = NULL;
3125
11.1k
uint32_t *parsed_pattern = cb->parsed_pattern;
3126
11.1k
uint32_t *parsed_pattern_end = cb->parsed_pattern_end;
3127
11.1k
uint32_t *this_parsed_item = NULL;
3128
11.1k
uint32_t *prev_parsed_item = NULL;
3129
11.1k
uint32_t meta_quantifier = 0;
3130
11.1k
uint32_t add_after_mark = 0;
3131
11.1k
uint16_t nest_depth = 0;
3132
11.1k
int16_t class_depth_m1 = -1; /* The m1 means minus 1. */
3133
11.1k
int16_t class_maxdepth_m1 = -1;
3134
11.1k
uint16_t hash;
3135
11.1k
int after_manual_callout = 0;
3136
11.1k
int expect_cond_assert = 0;
3137
11.1k
int errorcode = 0;
3138
11.1k
int escape;
3139
11.1k
int i;
3140
11.1k
BOOL inescq = FALSE;
3141
11.1k
BOOL inverbname = FALSE;
3142
11.1k
BOOL utf = (options & PCRE2_UTF) != 0;
3143
11.1k
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
3144
11.1k
BOOL is_dupname;
3145
11.1k
BOOL negate_class;
3146
11.1k
BOOL okquantifier = FALSE;
3147
11.1k
PCRE2_SPTR thisptr;
3148
11.1k
PCRE2_SPTR name;
3149
11.1k
PCRE2_SPTR ptrend = cb->end_pattern;
3150
11.1k
PCRE2_SPTR verbnamestart = NULL;    /* Value avoids compiler warning */
3151
11.1k
PCRE2_SPTR class_range_forbid_ptr = NULL;
3152
11.1k
named_group *ng;
3153
11.1k
nest_save *top_nest, *end_nests;
3154
#ifdef PCRE2_DEBUG
3155
uint32_t *parsed_pattern_check;
3156
ptrdiff_t parsed_pattern_extra = 0;
3157
ptrdiff_t parsed_pattern_extra_check = 0;
3158
PCRE2_SPTR ptr_check;
3159
#endif
3160
3161
11.1k
PCRE2_ASSERT(parsed_pattern != NULL);
3162
3163
/* Insert leading items for word and line matching (features provided for the
3164
benefit of pcre2grep). */
3165
3166
11.1k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
3167
0
  {
3168
0
  *parsed_pattern++ = META_CIRCUMFLEX;
3169
0
  *parsed_pattern++ = META_NOCAPTURE;
3170
0
  }
3171
11.1k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
3172
0
  {
3173
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
3174
0
  *parsed_pattern++ = META_NOCAPTURE;
3175
0
  }
3176
3177
#ifdef PCRE2_DEBUG
3178
parsed_pattern_check = parsed_pattern;
3179
ptr_check = ptr;
3180
#endif
3181
3182
/* If the pattern is actually a literal string, process it separately to avoid
3183
cluttering up the main loop. */
3184
3185
11.1k
if ((options & PCRE2_LITERAL) != 0)
3186
0
  {
3187
0
  while (ptr < ptrend)
3188
0
    {
3189
    /* LCOV_EXCL_START */
3190
0
    if (parsed_pattern >= parsed_pattern_end)
3191
0
      {
3192
0
      PCRE2_DEBUG_UNREACHABLE();
3193
0
      errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3194
0
      goto FAILED;
3195
0
      }
3196
    /* LCOV_EXCL_STOP */
3197
3198
0
    thisptr = ptr;
3199
0
    GETCHARINCTEST(c, ptr);
3200
0
    if (auto_callout)
3201
0
      parsed_pattern = manage_callouts(thisptr, &previous_callout,
3202
0
        auto_callout, parsed_pattern, cb);
3203
0
    PARSED_LITERAL(c, parsed_pattern);
3204
0
    }
3205
0
  goto PARSED_END;
3206
0
  }
3207
3208
/* Process a real regex which may contain meta-characters. */
3209
3210
11.1k
top_nest = NULL;
3211
11.1k
end_nests = (nest_save *)(cb->start_workspace + cb->workspace_size);
3212
3213
/* The size of the nest_save structure might not be a factor of the size of the
3214
workspace. Therefore we must round down end_nests so as to correctly avoid
3215
creating a nest_save that spans the end of the workspace. */
3216
3217
11.1k
end_nests = (nest_save *)((char *)end_nests -
3218
11.1k
  ((cb->workspace_size * sizeof(PCRE2_UCHAR)) % sizeof(nest_save)));
3219
3220
/* PCRE2_EXTENDED_MORE implies PCRE2_EXTENDED */
3221
3222
11.1k
if ((options & PCRE2_EXTENDED_MORE) != 0) options |= PCRE2_EXTENDED;
3223
3224
/* Now scan the pattern */
3225
3226
1.84M
while (ptr < ptrend)
3227
1.84M
  {
3228
1.84M
  int prev_expect_cond_assert;
3229
1.84M
  uint32_t min_repeat = 0, max_repeat = 0;
3230
1.84M
  uint32_t set, unset, *optset;
3231
1.84M
  uint32_t xset, xunset, *xoptset;
3232
1.84M
  uint32_t terminator;
3233
1.84M
  uint32_t prev_meta_quantifier;
3234
1.84M
  BOOL prev_okquantifier;
3235
1.84M
  PCRE2_SPTR tempptr;
3236
1.84M
  PCRE2_SIZE offset;
3237
3238
1.84M
  if (nest_depth > cb->cx->parens_nest_limit)
3239
2
    {
3240
2
    errorcode = ERR19;
3241
2
    goto FAILED;        /* Parentheses too deeply nested */
3242
2
    }
3243
3244
  /* Check that we haven't emitted too much into parsed_pattern. We allocate
3245
  a suitably-sized buffer upfront, then do unchecked writes to it. If we only
3246
  write a little bit too much, everything will appear to be OK, because the
3247
  upfront size is an overestimate... but a malicious pattern could end up
3248
  forcing a write past the buffer end. We must catch this during
3249
  development. */
3250
3251
#ifdef PCRE2_DEBUG
3252
  /* Strong post-write check. Won't help in release builds - at this point
3253
  the write has already occurred so it's too late. However, should stop us
3254
  committing unsafe code. */
3255
  PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
3256
               (parsed_pattern_extra - parsed_pattern_extra_check) <=
3257
                 max_parsed_pattern(ptr_check, ptr, utf, options));
3258
  parsed_pattern_check = parsed_pattern;
3259
  parsed_pattern_extra_check = parsed_pattern_extra;
3260
  ptr_check = ptr;
3261
#endif
3262
3263
  /* LCOV_EXCL_START */
3264
1.84M
  if (parsed_pattern >= parsed_pattern_end)
3265
0
    {
3266
    /* Weak pre-write check; only ensures parsed_pattern[0] is writeable
3267
    (but the code below can write many chars). Better than nothing. */
3268
0
    PCRE2_DEBUG_UNREACHABLE();
3269
0
    errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
3270
0
    goto FAILED;
3271
0
    }
3272
  /* LCOV_EXCL_STOP */
3273
3274
  /* If the last time round this loop something was added, parsed_pattern will
3275
  no longer be equal to this_parsed_item. Remember where the previous item
3276
  started and reset for the next item. Note that sometimes round the loop,
3277
  nothing gets added (e.g. for ignored white space). */
3278
3279
1.84M
  if (this_parsed_item != parsed_pattern)
3280
1.83M
    {
3281
1.83M
    prev_parsed_item = this_parsed_item;
3282
1.83M
    this_parsed_item = parsed_pattern;
3283
1.83M
    }
3284
3285
  /* Get next input character, save its position for callout handling. */
3286
3287
1.84M
  thisptr = ptr;
3288
1.84M
  GETCHARINCTEST(c, ptr);
3289
3290
  /* Copy quoted literals until \E, allowing for the possibility of automatic
3291
  callouts, except when processing a (*VERB) "name".  */
3292
3293
1.84M
  if (inescq)
3294
3.65k
    {
3295
3.65k
    if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3296
384
      {
3297
384
      inescq = FALSE;
3298
384
      ptr++;   /* Skip E */
3299
384
      }
3300
3.26k
    else
3301
3.26k
      {
3302
3.26k
      if (inverbname)
3303
0
        {                          /* Don't use PARSED_LITERAL() because it */
3304
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3305
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3306
#endif
3307
0
        *parsed_pattern++ = c;
3308
0
        }
3309
3.26k
      else
3310
3.26k
        {
3311
3.26k
        if (after_manual_callout-- <= 0)
3312
3.03k
          parsed_pattern = manage_callouts(thisptr, &previous_callout,
3313
3.03k
            auto_callout, parsed_pattern, cb);
3314
3.26k
        PARSED_LITERAL(c, parsed_pattern);
3315
3.26k
        }
3316
3.26k
      meta_quantifier = 0;
3317
3.26k
      }
3318
3.65k
    continue;  /* Next character */
3319
3.65k
    }
3320
3321
  /* If we are processing the "name" part of a (*VERB:NAME) item, all
3322
  characters up to the closing parenthesis are literals except when
3323
  PCRE2_ALT_VERBNAMES is set. That causes backslash interpretation, but only \Q
3324
  and \E and escaped characters are allowed (no character types such as \d). If
3325
  PCRE2_EXTENDED is also set, we must ignore white space and # comments. Do
3326
  this by not entering the special (*VERB:NAME) processing - they are then
3327
  picked up below. Note that c is a character, not a code unit, so we must not
3328
  use MAX_255 to test its size because MAX_255 tests code units and is assumed
3329
  TRUE in 8-bit mode. */
3330
3331
1.83M
  if (inverbname &&
3332
524k
       (
3333
        /* EITHER: not both options set */
3334
524k
        ((options & (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) !=
3335
524k
                    (PCRE2_EXTENDED | PCRE2_ALT_VERBNAMES)) ||
3336
0
#ifdef SUPPORT_UNICODE
3337
        /* OR: character > 255 AND not Unicode Pattern White Space */
3338
0
        (c > 255 && (c|1) != 0x200f && (c|1) != 0x2029) ||
3339
0
#endif
3340
        /* OR: not a # comment or isspace() white space */
3341
0
        (c < 256 && c != CHAR_NUMBER_SIGN && (cb->ctypes[c] & ctype_space) == 0
3342
0
#ifdef SUPPORT_UNICODE
3343
        /* and not CHAR_NEL when Unicode is supported */
3344
0
          && c != CHAR_NEL
3345
0
#endif
3346
0
       )))
3347
524k
    {
3348
524k
    PCRE2_SIZE verbnamelength;
3349
3350
524k
    switch(c)
3351
524k
      {
3352
522k
      default:                     /* Don't use PARSED_LITERAL() because it */
3353
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3354
      if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3355
#endif
3356
522k
      *parsed_pattern++ = c;
3357
522k
      break;
3358
3359
708
      case CHAR_RIGHT_PARENTHESIS:
3360
708
      inverbname = FALSE;
3361
      /* This is the length in characters */
3362
708
      verbnamelength = (PCRE2_SIZE)(parsed_pattern - verblengthptr - 1);
3363
      /* But the limit on the length is in code units */
3364
708
      if (ptr - verbnamestart - 1 > (int)MAX_MARK)
3365
1
        {
3366
1
        ptr--;
3367
1
        errorcode = ERR76;
3368
1
        goto FAILED;
3369
1
        }
3370
707
      *verblengthptr = (uint32_t)verbnamelength;
3371
3372
      /* If this name was on a verb such as (*ACCEPT) which does not continue,
3373
      a (*MARK) was generated for the name. We now add the original verb as the
3374
      next item. */
3375
3376
707
      if (add_after_mark != 0)
3377
2
        {
3378
2
        *parsed_pattern++ = add_after_mark;
3379
2
        add_after_mark = 0;
3380
2
        }
3381
707
      break;
3382
3383
939
      case CHAR_BACKSLASH:
3384
939
      if ((options & PCRE2_ALT_VERBNAMES) != 0)
3385
0
        {
3386
0
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3387
0
          xoptions, cb->bracount, FALSE, cb);
3388
0
        if (errorcode != 0) goto FAILED;
3389
0
        }
3390
939
      else escape = 0;   /* Treat all as literal */
3391
3392
939
      switch(escape)
3393
939
        {
3394
939
        case 0:                    /* Don't use PARSED_LITERAL() because it */
3395
#if PCRE2_CODE_UNIT_WIDTH == 32    /* sets okquantifier. */
3396
        if (c >= META_END) *parsed_pattern++ = META_BIGVALUE;
3397
#endif
3398
939
        *parsed_pattern++ = c;
3399
939
        break;
3400
3401
0
        case ESC_ub:
3402
0
        *parsed_pattern++ = CHAR_u;
3403
0
        PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3404
0
        break;
3405
3406
0
        case ESC_Q:
3407
0
        inescq = TRUE;
3408
0
        break;
3409
3410
0
        case ESC_E:           /* Ignore */
3411
0
        break;
3412
3413
0
        default:
3414
0
        errorcode = ERR40;    /* Invalid in verb name */
3415
0
        goto FAILED;
3416
939
        }
3417
524k
      }
3418
524k
    continue;   /* Next character in pattern */
3419
524k
    }
3420
3421
  /* Not a verb name character. At this point we must process everything that
3422
  must not change the quantification state. This is mainly comments, but we
3423
  handle \Q and \E here as well, so that an item such as A\Q\E+ is treated as
3424
  A+, as in Perl. An isolated \E is ignored. */
3425
3426
1.31M
  if (c == CHAR_BACKSLASH && ptr < ptrend)
3427
19.3k
    {
3428
19.3k
    if (*ptr == CHAR_Q || *ptr == CHAR_E)
3429
550
      {
3430
      /* A literal inside a \Q...\E is not allowed if we are expecting a
3431
      conditional assertion, but an empty \Q\E sequence is OK. */
3432
550
      if (expect_cond_assert > 0 && *ptr == CHAR_Q &&
3433
0
          !(ptrend - ptr >= 3 && ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E))
3434
0
        {
3435
0
        ptr--;
3436
0
        errorcode = ERR28;
3437
0
        goto FAILED;
3438
0
        }
3439
550
      inescq = *ptr == CHAR_Q;
3440
550
      ptr++;
3441
550
      continue;
3442
550
      }
3443
19.3k
    }
3444
3445
  /* Skip over whitespace and # comments in extended mode. Note that c is a
3446
  character, not a code unit, so we must not use MAX_255 to test its size
3447
  because MAX_255 tests code units and is assumed TRUE in 8-bit mode. The
3448
  whitespace characters are those designated as "Pattern White Space" by
3449
  Unicode, which are the isspace() characters plus CHAR_NEL (newline), which is
3450
  U+0085 in Unicode, plus U+200E, U+200F, U+2028, and U+2029. These are a
3451
  subset of space characters that match \h and \v. */
3452
3453
1.31M
  if ((options & PCRE2_EXTENDED) != 0)
3454
10.4k
    {
3455
10.4k
    if (c < 256 && (cb->ctypes[c] & ctype_space) != 0) continue;
3456
9.97k
#ifdef SUPPORT_UNICODE
3457
9.97k
    if (c == CHAR_NEL || (c|1) == 0x200f || (c|1) == 0x2029) continue;
3458
9.96k
#endif
3459
9.96k
    if (c == CHAR_NUMBER_SIGN)
3460
1.37k
      {
3461
4.85k
      while (ptr < ptrend)
3462
4.78k
        {
3463
4.78k
        if (IS_NEWLINE(ptr))      /* For non-fixed-length newline cases, */
3464
1.30k
          {                       /* IS_NEWLINE sets cb->nllen. */
3465
1.30k
          ptr += cb->nllen;
3466
1.30k
          break;
3467
1.30k
          }
3468
3.48k
        ptr++;
3469
3.48k
#ifdef SUPPORT_UNICODE
3470
3.48k
        if (utf) FORWARDCHARTEST(ptr, ptrend);
3471
3.48k
#endif
3472
3.48k
        }
3473
1.37k
      continue;  /* Next character in pattern */
3474
1.37k
      }
3475
9.96k
    }
3476
3477
  /* Skip over bracketed comments */
3478
3479
1.31M
  if (c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 2 &&
3480
432k
      ptr[0] == CHAR_QUESTION_MARK && ptr[1] == CHAR_NUMBER_SIGN)
3481
253
    {
3482
959
    while (++ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS);
3483
253
    if (ptr >= ptrend)
3484
20
      {
3485
20
      errorcode = ERR18;  /* A special error for missing ) in a comment */
3486
20
      goto FAILED;        /* to make it easier to debug. */
3487
20
      }
3488
233
    ptr++;
3489
233
    continue;  /* Next character in pattern */
3490
253
    }
3491
3492
  /* If the next item is not a quantifier, fill in length of any previous
3493
  callout and create an auto callout if required. */
3494
3495
1.30M
  if (c != CHAR_ASTERISK && c != CHAR_PLUS && c != CHAR_QUESTION_MARK &&
3496
1.19M
       (c != CHAR_LEFT_CURLY_BRACKET ||
3497
13.6k
         (tempptr = ptr,
3498
13.6k
         !read_repeat_counts(&tempptr, ptrend, NULL, NULL, &errorcode))))
3499
1.18M
    {
3500
1.18M
    if (after_manual_callout-- <= 0)
3501
1.18M
      {
3502
1.18M
      parsed_pattern = manage_callouts(thisptr, &previous_callout, auto_callout,
3503
1.18M
        parsed_pattern, cb);
3504
1.18M
      this_parsed_item = parsed_pattern;  /* New start for current item */
3505
1.18M
      }
3506
1.18M
    }
3507
3508
  /* If expect_cond_assert is 2, we have just passed (?( and are expecting an
3509
  assertion, possibly preceded by a callout. If the value is 1, we have just
3510
  had the callout and expect an assertion. There must be at least 3 more
3511
  characters in all cases. When expect_cond_assert is 2, we know that the
3512
  current character is an opening parenthesis, as otherwise we wouldn't be
3513
  here. However, when it is 1, we need to check, and it's easiest just to check
3514
  always. Note that expect_cond_assert may be negative, since all callouts just
3515
  decrement it. */
3516
3517
1.30M
  if (expect_cond_assert > 0)
3518
1.32k
    {
3519
1.32k
    BOOL ok = c == CHAR_LEFT_PARENTHESIS && ptrend - ptr >= 3 &&
3520
1.30k
              (ptr[0] == CHAR_QUESTION_MARK || ptr[0] == CHAR_ASTERISK);
3521
1.32k
    if (ok)
3522
1.30k
      {
3523
1.30k
      if (ptr[0] == CHAR_ASTERISK)  /* New alpha assertion format, possibly */
3524
0
        {
3525
0
        ok = MAX_255(ptr[1]) && (cb->ctypes[ptr[1]] & ctype_lcletter) != 0;
3526
0
        }
3527
1.30k
      else switch(ptr[1])  /* Traditional symbolic format */
3528
1.30k
        {
3529
264
        case CHAR_C:
3530
264
        ok = expect_cond_assert == 2;
3531
264
        break;
3532
3533
571
        case CHAR_EQUALS_SIGN:
3534
827
        case CHAR_EXCLAMATION_MARK:
3535
827
        break;
3536
3537
211
        case CHAR_LESS_THAN_SIGN:
3538
211
        ok = ptr[2] == CHAR_EQUALS_SIGN || ptr[2] == CHAR_EXCLAMATION_MARK;
3539
211
        break;
3540
3541
0
        default:
3542
0
        ok = FALSE;
3543
1.30k
        }
3544
1.30k
      }
3545
3546
1.32k
    if (!ok)
3547
33
      {
3548
33
      errorcode = ERR28;
3549
33
      if (expect_cond_assert == 2) goto FAILED;
3550
2
      goto FAILED_BACK;
3551
33
      }
3552
1.32k
    }
3553
3554
  /* Remember whether we are expecting a conditional assertion, and set the
3555
  default for this item. */
3556
3557
1.30M
  prev_expect_cond_assert = expect_cond_assert;
3558
1.30M
  expect_cond_assert = 0;
3559
3560
  /* Remember quantification status for the previous significant item, then set
3561
  default for this item. */
3562
3563
1.30M
  prev_okquantifier = okquantifier;
3564
1.30M
  prev_meta_quantifier = meta_quantifier;
3565
1.30M
  okquantifier = FALSE;
3566
1.30M
  meta_quantifier = 0;
3567
3568
  /* If the previous significant item was a quantifier, adjust the parsed code
3569
  if there is a following modifier. The base meta value is always followed by
3570
  the PLUS and QUERY values, in that order. We do this here rather than after
3571
  reading a quantifier so that intervening comments and /x whitespace can be
3572
  ignored without having to replicate code. */
3573
3574
1.30M
  if (prev_meta_quantifier != 0 && (c == CHAR_QUESTION_MARK || c == CHAR_PLUS))
3575
5.75k
    {
3576
5.75k
    parsed_pattern[(prev_meta_quantifier == META_MINMAX)? -3 : -1] =
3577
5.75k
      prev_meta_quantifier + ((c == CHAR_QUESTION_MARK)?
3578
4.99k
        0x00020000u : 0x00010000u);
3579
5.75k
    continue;  /* Next character in pattern */
3580
5.75k
    }
3581
3582
  /* Process the next item in the main part of a pattern. */
3583
3584
1.30M
  switch(c)
3585
1.30M
    {
3586
284k
    default:              /* Non-special character */
3587
284k
    PARSED_LITERAL(c, parsed_pattern);
3588
284k
    break;
3589
3590
3591
    /* ---- Escape sequence ---- */
3592
3593
18.8k
    case CHAR_BACKSLASH:
3594
18.8k
    tempptr = ptr;
3595
18.8k
    escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
3596
18.8k
      xoptions, cb->bracount, FALSE, cb);
3597
18.8k
    if (errorcode != 0)
3598
381
      {
3599
476
      ESCAPE_FAILED:
3600
476
      if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0)
3601
476
        goto FAILED;
3602
0
      ptr = tempptr;
3603
0
      if (ptr >= ptrend) c = CHAR_BACKSLASH; else
3604
0
        {
3605
0
        GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
3606
0
        }
3607
0
      escape = 0;                 /* Treat as literal character */
3608
0
      }
3609
3610
    /* The escape was a data escape or literal character. */
3611
3612
18.4k
    if (escape == 0)
3613
3.51k
      {
3614
3.51k
      PARSED_LITERAL(c, parsed_pattern);
3615
3.51k
      }
3616
3617
    /* The escape was a back (or forward) reference. We keep the offset in
3618
    order to give a more useful diagnostic for a bad forward reference. For
3619
    references to groups numbered less than 10 we can't use more than two items
3620
    in parsed_pattern because they may be just two characters in the input (and
3621
    in a 64-bit world an offset may need two elements). So for them, the offset
3622
    of the first occurrence is held in a special vector. */
3623
3624
14.9k
    else if (escape < 0)
3625
3.83k
      {
3626
3.83k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
3627
3.83k
      escape = -escape - 1;
3628
3.83k
      *parsed_pattern++ = META_BACKREF | (uint32_t)escape;
3629
3.83k
      if (escape < 10)
3630
3.39k
        {
3631
3.39k
        if (cb->small_ref_offset[escape] == PCRE2_UNSET)
3632
643
          cb->small_ref_offset[escape] = offset;
3633
3.39k
        }
3634
438
      else
3635
438
        {
3636
438
        PUTOFFSET(offset, parsed_pattern);
3637
438
        }
3638
3.83k
      okquantifier = TRUE;
3639
3.83k
      }
3640
3641
    /* The escape was a character class such as \d etc. or other special
3642
    escape indicator such as \A or \X. Most of them generate just a single
3643
    parsed item, but \P and \p are followed by a 16-bit type and a 16-bit
3644
    value. They are supported only when Unicode is available. The type and
3645
    value are packed into a single 32-bit value so that the whole sequence
3646
    uses only two elements in the parsed_vector. This is because the same
3647
    coding is used if \d (for example) is turned into \p{Nd} when PCRE2_UCP is
3648
    set.
3649
3650
    There are also some cases where the escape sequence is followed by a name:
3651
    \k{name}, \k<name>, and \k'name' are backreferences by name, and \g<name>
3652
    and \g'name' are subroutine calls by name; \g{name} is a synonym for
3653
    \k{name}. Note that \g<number> and \g'number' are handled by check_escape()
3654
    and returned as a negative value (handled above). A name is coded as an
3655
    offset into the pattern and a length. */
3656
3657
11.0k
    else switch (escape)
3658
11.0k
      {
3659
804
      case ESC_C:
3660
#ifdef NEVER_BACKSLASH_C
3661
      errorcode = ERR85;
3662
      goto ESCAPE_FAILED;
3663
#else
3664
804
      if ((options & PCRE2_NEVER_BACKSLASH_C) != 0)
3665
0
        {
3666
0
        errorcode = ERR83;
3667
0
        goto ESCAPE_FAILED;
3668
0
        }
3669
804
#endif
3670
804
      okquantifier = TRUE;
3671
804
      *parsed_pattern++ = META_ESCAPE + escape;
3672
804
      break;
3673
3674
      /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
3675
      when \u{ is not followed by hex digits and }. It requests two literal
3676
      characters, u and { and we need this, as otherwise \u{ 12} (for example)
3677
      would be treated as u{12} now that spaces are allowed in quantifiers. */
3678
3679
0
      case ESC_ub:
3680
0
      *parsed_pattern++ = CHAR_u;
3681
0
      PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
3682
0
      break;
3683
3684
1
      case ESC_X:
3685
#ifndef SUPPORT_UNICODE
3686
      errorcode = ERR45;   /* Supported only with Unicode support */
3687
      goto ESCAPE_FAILED;
3688
#endif
3689
66
      case ESC_H:
3690
135
      case ESC_h:
3691
453
      case ESC_N:
3692
527
      case ESC_R:
3693
727
      case ESC_V:
3694
787
      case ESC_v:
3695
787
      okquantifier = TRUE;
3696
787
      *parsed_pattern++ = META_ESCAPE + escape;
3697
787
      break;
3698
3699
1.30k
      default:  /* \A, \B, \b, \G, \K, \Z, \z cannot be quantified. */
3700
1.30k
      *parsed_pattern++ = META_ESCAPE + escape;
3701
1.30k
      break;
3702
3703
      /* Escapes that may change in UCP mode. */
3704
3705
75
      case ESC_d:
3706
343
      case ESC_D:
3707
413
      case ESC_s:
3708
488
      case ESC_S:
3709
557
      case ESC_w:
3710
634
      case ESC_W:
3711
634
      okquantifier = TRUE;
3712
634
      parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
3713
634
        xoptions);
3714
634
      break;
3715
3716
      /* Unicode property matching */
3717
3718
3
      case ESC_P:
3719
4
      case ESC_p:
3720
4
#ifdef SUPPORT_UNICODE
3721
4
        {
3722
4
        BOOL negated;
3723
4
        uint16_t ptype = 0, pdata = 0;
3724
4
        if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))
3725
4
          goto ESCAPE_FAILED;
3726
0
        if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
3727
0
        *parsed_pattern++ = META_ESCAPE + escape;
3728
0
        *parsed_pattern++ = (ptype << 16) | pdata;
3729
0
        okquantifier = TRUE;
3730
0
        }
3731
#else
3732
      errorcode = ERR45;
3733
      goto ESCAPE_FAILED;
3734
#endif
3735
0
      break;  /* End \P and \p */
3736
3737
      /* When \g is used with quotes or angle brackets as delimiters, it is a
3738
      numerical or named subroutine call, and control comes here. When used
3739
      with brace delimiters it is a numerical back reference and does not come
3740
      here because check_escape() returns it directly as a reference. \k is
3741
      always a named back reference. */
3742
3743
5.10k
      case ESC_g:
3744
7.55k
      case ESC_k:
3745
7.55k
      if (ptr >= ptrend || (*ptr != CHAR_LEFT_CURLY_BRACKET &&
3746
7.23k
          *ptr != CHAR_LESS_THAN_SIGN && *ptr != CHAR_APOSTROPHE))
3747
14
        {
3748
14
        errorcode = (escape == ESC_g)? ERR57 : ERR69;
3749
14
        goto ESCAPE_FAILED;
3750
14
        }
3751
7.54k
      terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
3752
7.21k
        CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
3753
6.89k
        CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
3754
3755
      /* For a non-braced \g, check for a numerical recursion. */
3756
3757
7.54k
      if (escape == ESC_g && terminator != CHAR_RIGHT_CURLY_BRACKET)
3758
5.10k
        {
3759
5.10k
        PCRE2_SPTR p = ptr + 1;
3760
3761
5.10k
        if (read_number(&p, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
3762
5.10k
            &errorcode))
3763
4.56k
          {
3764
4.56k
          if (p >= ptrend || *p != terminator)
3765
15
            {
3766
15
            ptr = p;
3767
15
            errorcode = ERR119;  /* Missing terminator for number */
3768
15
            goto ESCAPE_FAILED;
3769
15
            }
3770
4.55k
          ptr = p + 1;
3771
4.55k
          goto SET_RECURSION;
3772
4.56k
          }
3773
539
        if (errorcode != 0) goto ESCAPE_FAILED;
3774
539
        }
3775
3776
      /* Not a numerical recursion. Perl allows spaces and tabs after { and
3777
      before } but not for other delimiters. */
3778
3779
2.96k
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
3780
2.96k
          &errorcode, cb)) goto ESCAPE_FAILED;
3781
3782
      /* \k and \g when used with braces are back references, whereas \g used
3783
      with quotes or angle brackets is a recursion */
3784
3785
2.91k
      *parsed_pattern++ =
3786
2.91k
        (escape == ESC_k || terminator == CHAR_RIGHT_CURLY_BRACKET)?
3787
2.39k
          META_BACKREF_BYNAME : META_RECURSE_BYNAME;
3788
2.91k
      *parsed_pattern++ = namelen;
3789
3790
2.91k
      PUTOFFSET(offset, parsed_pattern);
3791
2.91k
      okquantifier = TRUE;
3792
2.91k
      break;  /* End special escape processing */
3793
11.0k
      }
3794
13.7k
    break;    /* End escape sequence processing */
3795
3796
3797
    /* ---- Single-character special items ---- */
3798
3799
13.7k
    case CHAR_CIRCUMFLEX_ACCENT:
3800
3.90k
    *parsed_pattern++ = META_CIRCUMFLEX;
3801
3.90k
    break;
3802
3803
601
    case CHAR_DOLLAR_SIGN:
3804
601
    *parsed_pattern++ = META_DOLLAR;
3805
601
    break;
3806
3807
5.48k
    case CHAR_DOT:
3808
5.48k
    *parsed_pattern++ = META_DOT;
3809
5.48k
    okquantifier = TRUE;
3810
5.48k
    break;
3811
3812
3813
    /* ---- Single-character quantifiers ---- */
3814
3815
5.92k
    case CHAR_ASTERISK:
3816
5.92k
    meta_quantifier = META_ASTERISK;
3817
5.92k
    goto CHECK_QUANTIFIER;
3818
3819
8.25k
    case CHAR_PLUS:
3820
8.25k
    meta_quantifier = META_PLUS;
3821
8.25k
    goto CHECK_QUANTIFIER;
3822
3823
92.4k
    case CHAR_QUESTION_MARK:
3824
92.4k
    meta_quantifier = META_QUERY;
3825
92.4k
    goto CHECK_QUANTIFIER;
3826
3827
3828
    /* ---- Potential {n,m} quantifier ---- */
3829
3830
13.6k
    case CHAR_LEFT_CURLY_BRACKET:
3831
13.6k
    if (!read_repeat_counts(&ptr, ptrend, &min_repeat, &max_repeat,
3832
13.6k
        &errorcode))
3833
5.04k
      {
3834
5.04k
      if (errorcode != 0) goto FAILED;     /* Error in quantifier. */
3835
5.04k
      PARSED_LITERAL(c, parsed_pattern);   /* Not a quantifier */
3836
5.04k
      break;                               /* No more quantifier processing */
3837
5.04k
      }
3838
8.64k
    meta_quantifier = META_MINMAX;
3839
    /* Fall through */
3840
3841
3842
    /* ---- Quantifier post-processing ---- */
3843
3844
    /* Check that a quantifier is allowed after the previous item. This
3845
    guarantees that there is a previous item. */
3846
3847
115k
    CHECK_QUANTIFIER:
3848
115k
    if (!prev_okquantifier)
3849
53
      {
3850
53
      errorcode = ERR9;
3851
53
      goto FAILED;
3852
53
      }
3853
3854
    /* Most (*VERB)s are not allowed to be quantified, but an ungreedy
3855
    quantifier can be useful for (*ACCEPT) - meaning "succeed on backtrack", a
3856
    sort of negated (*COMMIT). We therefore allow (*ACCEPT) to be quantified by
3857
    wrapping it in non-capturing brackets, but we have to allow for a preceding
3858
    (*MARK) for when (*ACCEPT) has an argument. */
3859
3860
115k
    if (*prev_parsed_item == META_ACCEPT)
3861
0
      {
3862
0
      uint32_t *p;
3863
0
      for (p = parsed_pattern - 1; p >= verbstartptr; p--) p[1] = p[0];
3864
0
      *verbstartptr = META_NOCAPTURE;
3865
0
      parsed_pattern[1] = META_KET;
3866
0
      parsed_pattern += 2;
3867
3868
#ifdef PCRE2_DEBUG
3869
      PCRE2_ASSERT(parsed_pattern_extra >= 2);
3870
      parsed_pattern_extra -= 2;
3871
#endif
3872
0
      }
3873
3874
    /* Now we can put the quantifier into the parsed pattern vector. At this
3875
    stage, we have only the basic quantifier. The check for a following + or ?
3876
    modifier happens at the top of the loop, after any intervening comments
3877
    have been removed. */
3878
3879
115k
    *parsed_pattern++ = meta_quantifier;
3880
115k
    if (c == CHAR_LEFT_CURLY_BRACKET)
3881
8.61k
      {
3882
8.61k
      *parsed_pattern++ = min_repeat;
3883
8.61k
      *parsed_pattern++ = max_repeat;
3884
8.61k
      }
3885
115k
    break;
3886
3887
3888
    /* ---- Character class ---- */
3889
3890
8.97k
    case CHAR_LEFT_SQUARE_BRACKET:
3891
3892
    /* In another (POSIX) regex library, the ugly syntax [[:<:]] and [[:>:]] is
3893
    used for "start of word" and "end of word". As these are otherwise illegal
3894
    sequences, we don't break anything by recognizing them. They are replaced
3895
    by \b(?=\w) and \b(?<=\w) respectively. Sequences like [a[:<:]] are
3896
    erroneous and are handled by the normal code below. */
3897
3898
8.97k
    if (ptrend - ptr >= 6 &&
3899
8.31k
         (PRIV(strncmp_c8)(ptr, STRING_WEIRD_STARTWORD, 6) == 0 ||
3900
8.19k
          PRIV(strncmp_c8)(ptr, STRING_WEIRD_ENDWORD, 6) == 0))
3901
280
      {
3902
280
      *parsed_pattern++ = META_ESCAPE + ESC_b;
3903
3904
280
      if (ptr[2] == CHAR_LESS_THAN_SIGN)
3905
115
        {
3906
115
        *parsed_pattern++ = META_LOOKAHEAD;
3907
115
        }
3908
165
      else
3909
165
        {
3910
165
        *parsed_pattern++ = META_LOOKBEHIND;
3911
165
        *has_lookbehind = TRUE;
3912
3913
        /* The offset is used only for the "non-fixed length" error; this won't
3914
        occur here, so just store zero. */
3915
3916
165
        PUTOFFSET((PCRE2_SIZE)0, parsed_pattern);
3917
165
        }
3918
3919
280
      if ((options & PCRE2_UCP) == 0)
3920
280
        *parsed_pattern++ = META_ESCAPE + ESC_w;
3921
0
      else
3922
0
        {
3923
0
        *parsed_pattern++ = META_ESCAPE + ESC_p;
3924
0
        *parsed_pattern++ = PT_WORD << 16;
3925
0
        }
3926
280
      *parsed_pattern++ = META_KET;
3927
280
      ptr += 6;
3928
280
      okquantifier = TRUE;
3929
280
      break;
3930
280
      }
3931
3932
    /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3933
    they are encountered at the top level, so we'll do that too. */
3934
3935
8.69k
    if (ptr < ptrend && (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
3936
8.21k
         *ptr == CHAR_EQUALS_SIGN) &&
3937
624
        check_posix_syntax(ptr, ptrend, &tempptr))
3938
2
      {
3939
2
      errorcode = (*ptr-- == CHAR_COLON)? ERR12 : ERR13;
3940
2
      ptr = tempptr + 2;
3941
2
      goto FAILED;
3942
2
      }
3943
3944
8.69k
    class_mode_state = ((options & PCRE2_ALT_EXTENDED_CLASS) != 0)?
3945
8.69k
        CLASS_MODE_ALT_EXT : CLASS_MODE_NORMAL;
3946
3947
    /* Jump here from '(?[...])'. That jump must initialize class_mode_state,
3948
    set c to the '[' character, and ptr to just after the '['. */
3949
3950
8.69k
    FROM_PERL_EXTENDED_CLASS:
3951
8.69k
    okquantifier = TRUE;
3952
3953
    /* In an EBCDIC environment, Perl treats alphabetic ranges specially
3954
    because there are holes in the encoding, and simply using the range A-Z
3955
    (for example) would include the characters in the holes. This applies only
3956
    to ranges where both values are literal; [\xC1-\xE9] is different to [A-Z]
3957
    in this respect. In order to accommodate this, we keep track of whether
3958
    character values are literal or not, and a state variable for handling
3959
    ranges. */
3960
3961
    /* Loop for the contents of the class. Classes may be nested, if
3962
    PCRE2_ALT_EXTENDED_CLASS is set, or the class is of the form (?[...]). */
3963
3964
    /* c is still set to '[' so the loop will handle the start of the class. */
3965
3966
8.69k
    class_depth_m1 = -1;
3967
8.69k
    class_maxdepth_m1 = -1;
3968
8.69k
    class_range_state = RANGE_NO;
3969
8.69k
    class_op_state = CLASS_OP_EMPTY;
3970
8.69k
    class_start = NULL;
3971
3972
8.69k
    for (;;)
3973
92.1k
      {
3974
92.1k
      BOOL char_is_literal = TRUE;
3975
3976
      /* Inside \Q...\E everything is literal except \E */
3977
3978
92.1k
      if (inescq)
3979
1.51k
        {
3980
1.51k
        if (c == CHAR_BACKSLASH && ptr < ptrend && *ptr == CHAR_E)
3981
592
          {
3982
592
          inescq = FALSE;                   /* Reset literal state */
3983
592
          ptr++;                            /* Skip the 'E' */
3984
592
          goto CLASS_CONTINUE;
3985
592
          }
3986
3987
        /* Surprisingly, you cannot use \Q..\E to escape a character inside a
3988
        Perl extended class. However, empty \Q\E sequences are allowed, so here
3989
        we're only giving an error if the \Q..\E is non-empty. */
3990
3991
927
        if (class_mode_state == CLASS_MODE_PERL_EXT)
3992
0
          {
3993
0
          errorcode = ERR116;
3994
0
          goto FAILED;
3995
0
          }
3996
3997
927
        goto CLASS_LITERAL;
3998
927
        }
3999
4000
      /* Skip over space and tab (only) in extended-more mode, or anywhere
4001
      inside a Perl extended class (which implies /xx). */
4002
4003
90.6k
      if ((c == CHAR_SPACE || c == CHAR_HT) &&
4004
147
          ((options & PCRE2_EXTENDED_MORE) != 0 ||
4005
147
           class_mode_state >= CLASS_MODE_PERL_EXT))
4006
0
        goto CLASS_CONTINUE;
4007
4008
      /* Handle POSIX class names. Perl allows a negation extension of the
4009
      form [:^name:]. A square bracket that doesn't match the syntax is
4010
      treated as a literal. We also recognize the POSIX constructions
4011
      [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
4012
      5.6 and 5.8 do. */
4013
4014
90.6k
      if (class_depth_m1 >= 0 &&
4015
81.9k
          c == CHAR_LEFT_SQUARE_BRACKET &&
4016
10.4k
          ptrend - ptr >= 3 &&
4017
10.2k
          (*ptr == CHAR_COLON || *ptr == CHAR_DOT ||
4018
5.21k
           *ptr == CHAR_EQUALS_SIGN) &&
4019
5.37k
          check_posix_syntax(ptr, ptrend, &tempptr))
4020
4.41k
        {
4021
4.41k
        BOOL posix_negate = FALSE;
4022
4.41k
        int posix_class;
4023
4024
        /* Perl treats a hyphen before a POSIX class as a literal, not the
4025
        start of a range. However, it gives a warning in its warning mode. PCRE
4026
        does not have a warning mode, so we give an error, because this is
4027
        likely an error on the user's part. */
4028
4029
4.41k
        if (class_range_state == RANGE_STARTED)
4030
1
          {
4031
1
          ptr = tempptr + 2;
4032
1
          errorcode = ERR50;
4033
1
          goto FAILED;
4034
1
          }
4035
4036
        /* Perl treats a hyphen after a POSIX class as a literal, not the
4037
        start of a range. However, it gives a warning in its warning mode
4038
        unless the hyphen is the last character in the class. PCRE does not
4039
        have a warning mode, so we give an error, because this is likely an
4040
        error on the user's part.
4041
4042
        Roll back to the hyphen for the error position. */
4043
4044
4.41k
        if (class_range_state == RANGE_FORBID_STARTED)
4045
0
          {
4046
0
          ptr = class_range_forbid_ptr;
4047
0
          errorcode = ERR50;
4048
0
          goto FAILED;
4049
0
          }
4050
4051
        /* Disallow implicit union in Perl extended classes. */
4052
4053
4.41k
        if (class_op_state == CLASS_OP_OPERAND &&
4054
3.71k
            class_mode_state == CLASS_MODE_PERL_EXT)
4055
0
          {
4056
0
          ptr = tempptr + 2;
4057
0
          errorcode = ERR113;
4058
0
          goto FAILED;
4059
0
          }
4060
4061
4.41k
        if (*ptr != CHAR_COLON)
4062
1
          {
4063
1
          ptr = tempptr + 2;
4064
1
          errorcode = ERR13;
4065
1
          goto FAILED;
4066
1
          }
4067
4068
4.41k
        if (*(++ptr) == CHAR_CIRCUMFLEX_ACCENT)
4069
106
          {
4070
106
          posix_negate = TRUE;
4071
106
          ptr++;
4072
106
          }
4073
4074
4.41k
        posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
4075
4.41k
        ptr = tempptr + 2;
4076
4.41k
        if (posix_class < 0)
4077
103
          {
4078
103
          errorcode = ERR30;
4079
103
          goto FAILED;
4080
103
          }
4081
4082
        /* Set "a hyphen is forbidden to be the start of a range". For the '-]'
4083
        case, the hyphen is treated as a literal, but for '-1' it is disallowed
4084
        (because it would be interpreted as range). */
4085
4086
4.30k
        class_range_state = RANGE_FORBID_NO;
4087
4.30k
        class_op_state = CLASS_OP_OPERAND;
4088
4089
        /* When PCRE2_UCP is set, unless PCRE2_EXTRA_ASCII_POSIX is set, some
4090
        of the POSIX classes are converted to use Unicode properties \p or \P
4091
        or, in one case, \h or \H. The substitutes table has two values per
4092
        class, containing the type and value of a \p or \P item. The special
4093
        cases are specified with a negative type: a non-zero value causes \h or
4094
        \H to be used, and a zero value falls through to behave like a non-UCP
4095
        POSIX class. There are now also some extra options that force ASCII for
4096
        some classes. */
4097
4098
4.30k
#ifdef SUPPORT_UNICODE
4099
4.30k
        if ((options & PCRE2_UCP) != 0 &&
4100
0
            (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
4101
0
            !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
4102
0
              (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
4103
0
          {
4104
0
          int ptype = posix_substitutes[2*posix_class];
4105
0
          int pvalue = posix_substitutes[2*posix_class + 1];
4106
4107
0
          if (ptype >= 0)
4108
0
            {
4109
0
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_P : ESC_p);
4110
0
            *parsed_pattern++ = (ptype << 16) | pvalue;
4111
0
            goto CLASS_CONTINUE;
4112
0
            }
4113
4114
0
          if (pvalue != 0)
4115
0
            {
4116
0
            *parsed_pattern++ = META_ESCAPE + (posix_negate? ESC_H : ESC_h);
4117
0
            goto CLASS_CONTINUE;
4118
0
            }
4119
4120
          /* Fall through */
4121
0
          }
4122
4.30k
#endif  /* SUPPORT_UNICODE */
4123
4124
        /* Non-UCP POSIX class */
4125
4126
4.30k
        *parsed_pattern++ = posix_negate? META_POSIX_NEG : META_POSIX;
4127
4.30k
        *parsed_pattern++ = posix_class;
4128
4.30k
        }
4129
4130
      /* Check for the start of the outermost class, or the start of a nested class. */
4131
4132
86.2k
      else if ((c == CHAR_LEFT_SQUARE_BRACKET &&
4133
14.7k
                (class_depth_m1 < 0 || class_mode_state == CLASS_MODE_ALT_EXT ||
4134
6.01k
                 class_mode_state == CLASS_MODE_PERL_EXT)) ||
4135
77.5k
               (c == CHAR_LEFT_PARENTHESIS &&
4136
1.02k
                class_mode_state == CLASS_MODE_PERL_EXT))
4137
8.69k
        {
4138
8.69k
        uint32_t start_c = c;
4139
8.69k
        uint32_t new_class_mode_state;
4140
4141
        /* Update the class mode, if moving into a 'leaf' inside a Perl extended
4142
        class. */
4143
4144
8.69k
        if (start_c == CHAR_LEFT_SQUARE_BRACKET &&
4145
8.69k
            class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 >= 0)
4146
0
          new_class_mode_state = CLASS_MODE_PERL_EXT_LEAF;
4147
8.69k
        else
4148
8.69k
          new_class_mode_state = class_mode_state;
4149
4150
        /* Tidy up the other class before starting the nested class. */
4151
        /* -[ beginning a nested class is a literal '-' */
4152
4153
8.69k
        if (class_range_state == RANGE_STARTED)
4154
0
          parsed_pattern[-1] = CHAR_MINUS;
4155
4156
        /* Disallow implicit union in Perl extended classes. */
4157
4158
8.69k
        if (class_op_state == CLASS_OP_OPERAND &&
4159
0
            class_mode_state == CLASS_MODE_PERL_EXT)
4160
0
          {
4161
0
          errorcode = ERR113;
4162
0
          goto FAILED;
4163
0
          }
4164
4165
        /* Validate nesting depth */
4166
8.69k
        if (class_depth_m1 >= ECLASS_NEST_LIMIT - 1)
4167
0
          {
4168
0
          ptr--;  /* Point rightwards at the paren, same as ERR19. */
4169
0
          errorcode = ERR107;  /* Classes too deeply nested */
4170
0
          goto FAILED;
4171
0
          }
4172
4173
        /* Process the character class start. If the first character is '^', set
4174
        the negation flag. If the first few characters (either before or after ^)
4175
        are \Q\E or \E or space or tab in extended-more mode, we skip them too.
4176
        This makes for compatibility with Perl. */
4177
4178
8.69k
        negate_class = FALSE;
4179
8.69k
        for (;;)
4180
10.2k
          {
4181
10.2k
          if (ptr >= ptrend)
4182
26
            {
4183
26
            if (start_c == CHAR_LEFT_PARENTHESIS)
4184
0
              errorcode = ERR14;  /* Missing terminating ')' */
4185
26
            else
4186
26
              errorcode = ERR6;   /* Missing terminating ']' */
4187
26
            goto FAILED;
4188
26
            }
4189
4190
10.2k
          GETCHARINCTEST(c, ptr);
4191
10.2k
          if (new_class_mode_state == CLASS_MODE_PERL_EXT) break;
4192
10.2k
          else if (c == CHAR_BACKSLASH)
4193
1.09k
            {
4194
1.09k
            if (ptr < ptrend && *ptr == CHAR_E) ptr++;
4195
998
            else if (ptrend - ptr >= 3 &&
4196
913
                PRIV(strncmp_c8)(ptr, STR_Q STR_BACKSLASH STR_E, 3) == 0)
4197
99
              ptr += 3;
4198
899
            else
4199
899
              break;
4200
1.09k
            }
4201
9.16k
          else if ((c == CHAR_SPACE || c == CHAR_HT) &&  /* Note: just these two */
4202
7
                   ((options & PCRE2_EXTENDED_MORE) != 0 ||
4203
7
                    new_class_mode_state >= CLASS_MODE_PERL_EXT))
4204
0
            continue;
4205
9.16k
          else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
4206
1.39k
            negate_class = TRUE;
4207
7.77k
          else break;
4208
10.2k
          }
4209
4210
        /* Now the real contents of the class; c has the first "real" character.
4211
        Empty classes are permitted only if the option is set, and if it's not
4212
        a Perl-extended class. */
4213
4214
8.67k
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4215
315
            (cb->external_options & PCRE2_ALLOW_EMPTY_CLASS) != 0 &&
4216
0
            new_class_mode_state < CLASS_MODE_PERL_EXT)
4217
0
          {
4218
0
          PCRE2_ASSERT(start_c == CHAR_LEFT_SQUARE_BRACKET);
4219
4220
0
          if (class_start != NULL)
4221
0
            {
4222
0
            PCRE2_ASSERT(class_depth_m1 >= 0);
4223
            /* Represents that the class is an extended class. */
4224
0
            *class_start |= CLASS_IS_ECLASS;
4225
0
            class_start = NULL;
4226
0
            }
4227
4228
0
          *parsed_pattern++ = negate_class? META_CLASS_EMPTY_NOT : META_CLASS_EMPTY;
4229
4230
          /* Leave nesting depth unchanged; but check for zero depth to handle the
4231
          very first (top-level) class being empty. */
4232
0
          if (class_depth_m1 < 0) break;
4233
4234
0
          class_range_state = RANGE_NO; /* for processing the containing class */
4235
0
          class_op_state = CLASS_OP_OPERAND;
4236
0
          goto CLASS_CONTINUE;
4237
0
          }
4238
4239
        /* Enter a non-empty class. */
4240
4241
8.67k
        if (class_start != NULL)
4242
0
          {
4243
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4244
          /* Represents that the class is an extended class. */
4245
0
          *class_start |= CLASS_IS_ECLASS;
4246
0
          class_start = NULL;
4247
0
          }
4248
4249
8.67k
        class_start = parsed_pattern;
4250
8.67k
        *parsed_pattern++ = negate_class? META_CLASS_NOT : META_CLASS;
4251
8.67k
        class_range_state = RANGE_NO;
4252
8.67k
        class_op_state = CLASS_OP_EMPTY;
4253
8.67k
        class_mode_state = new_class_mode_state;
4254
8.67k
        ++class_depth_m1;
4255
8.67k
        if (class_maxdepth_m1 < class_depth_m1)
4256
8.67k
          class_maxdepth_m1 = class_depth_m1;
4257
        /* Reset; no op seen yet at new depth. */
4258
8.67k
        cb->class_op_used[class_depth_m1] = 0;
4259
4260
        /* Implement the special start-of-class literal meaning of ']'. */
4261
8.67k
        if (c == CHAR_RIGHT_SQUARE_BRACKET &&
4262
315
            new_class_mode_state != CLASS_MODE_PERL_EXT)
4263
315
          {
4264
315
          class_range_state = RANGE_OK_LITERAL;
4265
315
          class_op_state = CLASS_OP_OPERAND;
4266
315
          PARSED_LITERAL(c, parsed_pattern);
4267
315
          goto CLASS_CONTINUE;
4268
315
          }
4269
4270
8.35k
        continue;  /* We have already loaded c with the next character */
4271
8.67k
        }
4272
4273
      /* Check for the end of the class. */
4274
4275
77.5k
      else if (c == CHAR_RIGHT_SQUARE_BRACKET ||
4276
69.3k
               (c == CHAR_RIGHT_PARENTHESIS && class_mode_state == CLASS_MODE_PERL_EXT))
4277
8.17k
        {
4278
        /* In Perl extended mode, the ']' can only be used to match the
4279
        opening '[', and ')' must match an opening parenthesis. */
4280
8.17k
        if (class_mode_state == CLASS_MODE_PERL_EXT)
4281
0
          {
4282
0
          if (c == CHAR_RIGHT_SQUARE_BRACKET && class_depth_m1 != 0)
4283
0
            {
4284
0
            errorcode = ERR14;
4285
0
            ptr--;  /* Correct the offset */
4286
0
            goto FAILED;
4287
0
            }
4288
0
          if (c == CHAR_RIGHT_PARENTHESIS && class_depth_m1 < 1)
4289
0
            {
4290
0
            errorcode = ERR22;
4291
0
            goto FAILED;
4292
0
            }
4293
0
          }
4294
4295
        /* Check no trailing operator. */
4296
8.17k
        if (class_op_state == CLASS_OP_OPERATOR)
4297
0
          {
4298
0
          errorcode = ERR110;
4299
0
          goto FAILED;
4300
0
          }
4301
4302
        /* Check no empty expression for Perl extended expressions. */
4303
8.17k
        if (class_mode_state == CLASS_MODE_PERL_EXT &&
4304
0
            class_op_state == CLASS_OP_EMPTY)
4305
0
          {
4306
0
          errorcode = ERR114;
4307
0
          goto FAILED;
4308
0
          }
4309
4310
        /* -] at the end of a class is a literal '-' */
4311
8.17k
        if (class_range_state == RANGE_STARTED)
4312
159
          parsed_pattern[-1] = CHAR_MINUS;
4313
4314
8.17k
        *parsed_pattern++ = META_CLASS_END;
4315
4316
8.17k
        if (--class_depth_m1 < 0)
4317
8.17k
          {
4318
          /* Check for and consume ')' after '(?[...]'. */
4319
8.17k
          PCRE2_ASSERT(class_mode_state != CLASS_MODE_PERL_EXT_LEAF);
4320
8.17k
          if (class_mode_state == CLASS_MODE_PERL_EXT)
4321
0
            {
4322
0
            if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
4323
0
              {
4324
0
              errorcode = ERR115;
4325
0
              goto FAILED;
4326
0
              }
4327
4328
0
            ptr++;
4329
0
            }
4330
4331
8.17k
          break;
4332
8.17k
          }
4333
4334
0
        class_range_state = RANGE_NO; /* for processing the containing class */
4335
0
        class_op_state = CLASS_OP_OPERAND;
4336
0
        if (class_mode_state == CLASS_MODE_PERL_EXT_LEAF)
4337
0
          class_mode_state = CLASS_MODE_PERL_EXT;
4338
        /* The extended class flag has already
4339
        been set for the parent class. */
4340
0
        class_start = NULL;
4341
0
        }
4342
4343
      /* Handle a Perl set binary operator */
4344
4345
69.3k
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4346
0
               (c == CHAR_PLUS || c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4347
0
                c == CHAR_AMPERSAND || c == CHAR_CIRCUMFLEX_ACCENT))
4348
0
        {
4349
        /* Check that there was a preceding operand. */
4350
0
        if (class_op_state != CLASS_OP_OPERAND)
4351
0
          {
4352
0
          errorcode = ERR109;
4353
0
          goto FAILED;
4354
0
          }
4355
4356
0
        if (class_start != NULL)
4357
0
          {
4358
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4359
          /* Represents that the class is an extended class. */
4360
0
          *class_start |= CLASS_IS_ECLASS;
4361
0
          class_start = NULL;
4362
0
          }
4363
4364
0
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4365
0
                     class_range_state != RANGE_FORBID_STARTED);
4366
4367
0
        *parsed_pattern++ = c == CHAR_PLUS? META_ECLASS_OR :
4368
0
                            c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4369
0
                            c == CHAR_MINUS? META_ECLASS_SUB :
4370
0
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4371
0
                            META_ECLASS_XOR;
4372
0
        class_range_state = RANGE_NO;
4373
0
        class_op_state = CLASS_OP_OPERATOR;
4374
0
        }
4375
4376
      /* Handle a Perl set unary operator */
4377
4378
69.3k
      else if (class_mode_state == CLASS_MODE_PERL_EXT &&
4379
0
               c == CHAR_EXCLAMATION_MARK)
4380
0
        {
4381
        /* Check that the "!" has not got a preceding operand (i.e. it's the
4382
        start of the class, or follows an operator). */
4383
0
        if (class_op_state == CLASS_OP_OPERAND)
4384
0
          {
4385
0
          errorcode = ERR113;
4386
0
          goto FAILED;
4387
0
          }
4388
4389
0
        if (class_start != NULL)
4390
0
          {
4391
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4392
          /* Represents that the class is an extended class. */
4393
0
          *class_start |= CLASS_IS_ECLASS;
4394
0
          class_start = NULL;
4395
0
          }
4396
4397
0
        PCRE2_ASSERT(class_range_state != RANGE_STARTED &&
4398
0
                     class_range_state != RANGE_FORBID_STARTED);
4399
4400
0
        *parsed_pattern++ = META_ECLASS_NOT;
4401
0
        class_range_state = RANGE_NO;
4402
0
        class_op_state = CLASS_OP_OPERATOR;
4403
0
        }
4404
4405
      /* Handle a UTS#18 set operator */
4406
4407
69.3k
      else if (class_mode_state == CLASS_MODE_ALT_EXT &&
4408
0
               (c == CHAR_VERTICAL_LINE || c == CHAR_MINUS ||
4409
0
                c == CHAR_AMPERSAND || c == CHAR_TILDE) &&
4410
0
               ptr < ptrend && *ptr == c)
4411
0
        {
4412
0
        ++ptr;
4413
4414
        /* Check there isn't a triple-repetition. */
4415
0
        if (ptr < ptrend && *ptr == c)
4416
0
          {
4417
0
          while (ptr < ptrend && *ptr == c) ++ptr;  /* Improve error offset. */
4418
0
          errorcode = ERR108;
4419
0
          goto FAILED;
4420
0
          }
4421
4422
        /* Check for a preceding operand. */
4423
0
        if (class_op_state != CLASS_OP_OPERAND)
4424
0
          {
4425
0
          errorcode = ERR109;
4426
0
          goto FAILED;
4427
0
          }
4428
4429
        /* Check for mixed precedence. Forbid [A--B&&C]. */
4430
0
        if (cb->class_op_used[class_depth_m1] != 0 &&
4431
0
            cb->class_op_used[class_depth_m1] != (uint8_t)c)
4432
0
          {
4433
0
          errorcode = ERR111;
4434
0
          goto FAILED;
4435
0
          }
4436
4437
0
        if (class_start != NULL)
4438
0
          {
4439
0
          PCRE2_ASSERT(class_depth_m1 >= 0);
4440
          /* Represents that the class is an extended class. */
4441
0
          *class_start |= CLASS_IS_ECLASS;
4442
0
          class_start = NULL;
4443
0
          }
4444
4445
        /* Dangling '-' before an operator is a literal */
4446
0
        if (class_range_state == RANGE_STARTED)
4447
0
          parsed_pattern[-1] = CHAR_MINUS;
4448
4449
0
        *parsed_pattern++ = c == CHAR_VERTICAL_LINE? META_ECLASS_OR :
4450
0
                            c == CHAR_MINUS? META_ECLASS_SUB :
4451
0
                            c == CHAR_AMPERSAND? META_ECLASS_AND :
4452
0
                            META_ECLASS_XOR;
4453
0
        class_range_state = RANGE_NO;
4454
0
        class_op_state = CLASS_OP_OPERATOR;
4455
0
        cb->class_op_used[class_depth_m1] = (uint8_t)c;
4456
0
        }
4457
4458
      /* Handle escapes in a class */
4459
4460
69.3k
      else if (c == CHAR_BACKSLASH)
4461
5.86k
        {
4462
5.86k
        tempptr = ptr;
4463
5.86k
        escape = PRIV(check_escape)(&ptr, ptrend, &c, &errorcode, options,
4464
5.86k
          xoptions, cb->bracount, TRUE, cb);
4465
4466
5.86k
        if (errorcode != 0)
4467
19
          {
4468
19
          if ((xoptions & PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL) == 0 ||
4469
0
              class_mode_state >= CLASS_MODE_PERL_EXT)
4470
19
            goto FAILED;
4471
0
          ptr = tempptr;
4472
0
          if (ptr >= ptrend) c = CHAR_BACKSLASH; else
4473
0
            {
4474
0
            GETCHARINCTEST(c, ptr);   /* Get character value, increment pointer */
4475
0
            }
4476
0
          escape = 0;                 /* Treat as literal character */
4477
0
          }
4478
4479
5.84k
        switch(escape)
4480
5.84k
          {
4481
789
          case 0:  /* Escaped character code point is in c */
4482
789
          char_is_literal = FALSE;
4483
789
          goto CLASS_LITERAL;      /* (a few lines above) */
4484
4485
201
          case ESC_b:
4486
201
          c = CHAR_BS;    /* \b is backspace in a class */
4487
201
          char_is_literal = FALSE;
4488
201
          goto CLASS_LITERAL;
4489
4490
6
          case ESC_k:
4491
6
          c = CHAR_k;     /* \k is not special in a class, just like \g */
4492
6
          char_is_literal = FALSE;
4493
6
          goto CLASS_LITERAL;
4494
4495
631
          case ESC_Q:
4496
631
          inescq = TRUE;  /* Enter literal mode */
4497
631
          goto CLASS_CONTINUE;
4498
4499
309
          case ESC_E:     /* Ignore orphan \E */
4500
309
          goto CLASS_CONTINUE;
4501
4502
0
          case ESC_B:     /* Always an error in a class */
4503
15
          case ESC_R:
4504
16
          case ESC_X:
4505
16
          errorcode = ERR7;
4506
16
          goto FAILED;
4507
4508
2
          case ESC_N:     /* Not permitted by Perl either */
4509
2
          errorcode = ERR71;
4510
2
          goto FAILED;
4511
4512
154
          case ESC_H:
4513
1.48k
          case ESC_h:
4514
1.93k
          case ESC_V:
4515
2.04k
          case ESC_v:
4516
2.04k
          *parsed_pattern++ = META_ESCAPE + escape;
4517
2.04k
          break;
4518
4519
          /* These escapes may be converted to Unicode property tests when
4520
          PCRE2_UCP is set. */
4521
4522
100
          case ESC_d:
4523
685
          case ESC_D:
4524
1.33k
          case ESC_s:
4525
1.46k
          case ESC_S:
4526
1.57k
          case ESC_w:
4527
1.82k
          case ESC_W:
4528
1.82k
          parsed_pattern = handle_escdsw(escape, parsed_pattern, options,
4529
1.82k
            xoptions);
4530
1.82k
          break;
4531
4532
          /* Explicit Unicode property matching */
4533
4534
1
          case ESC_P:
4535
2
          case ESC_p:
4536
2
#ifdef SUPPORT_UNICODE
4537
2
            {
4538
2
            BOOL negated;
4539
2
            uint16_t ptype = 0, pdata = 0;
4540
2
            if (!get_ucp(&ptr, utf, &negated, &ptype, &pdata, &errorcode, cb))
4541
2
              goto FAILED;
4542
4543
            /* In caseless matching, particular characteristics Lu, Ll, and Lt
4544
            get converted to the general characteristic L&. That is, upper,
4545
            lower, and title case letters are all conflated. */
4546
4547
0
            if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
4548
0
                (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
4549
0
              {
4550
0
              ptype = PT_LAMP;
4551
0
              pdata = 0;
4552
0
              }
4553
4554
0
            if (negated) escape = (escape == ESC_P)? ESC_p : ESC_P;
4555
0
            *parsed_pattern++ = META_ESCAPE + escape;
4556
0
            *parsed_pattern++ = (ptype << 16) | pdata;
4557
0
            }
4558
#else
4559
          errorcode = ERR45;
4560
          goto FAILED;
4561
#endif
4562
0
          break;  /* End \P and \p */
4563
4564
          /* All others are not allowed in a class */
4565
4566
          /* LCOV_EXCL_START */
4567
0
          default:
4568
0
          PCRE2_DEBUG_UNREACHABLE();
4569
0
          PCRE2_FALLTHROUGH /* Fall through */
4570
          /* LCOV_EXCL_STOP */
4571
4572
9
          case ESC_A:
4573
11
          case ESC_Z:
4574
13
          case ESC_z:
4575
16
          case ESC_G:
4576
17
          case ESC_K:
4577
21
          case ESC_C:
4578
21
          errorcode = ERR7;
4579
21
          goto FAILED;
4580
5.84k
          }
4581
4582
        /* All the switch-cases above which end in "break" describe a set
4583
        of characters. None may start a range. */
4584
4585
        /* The second part of a range can be a single-character escape
4586
        sequence (detected above), but not any of the other escapes. Perl
4587
        treats a hyphen as a literal in such circumstances. However, in Perl's
4588
        warning mode, a warning is given, so PCRE now faults it, as it is
4589
        almost certainly a mistake on the user's part. */
4590
4591
3.86k
        if (class_range_state == RANGE_STARTED)
4592
0
          {
4593
0
          errorcode = ERR50;
4594
0
          goto FAILED;
4595
0
          }
4596
4597
        /* Perl gives a warning unless the hyphen following a multi-character
4598
        escape is the last character in the class. PCRE throws an error. */
4599
4600
3.86k
        if (class_range_state == RANGE_FORBID_STARTED)
4601
0
          {
4602
0
          ptr = class_range_forbid_ptr;
4603
0
          errorcode = ERR50;
4604
0
          goto FAILED;
4605
0
          }
4606
4607
        /* Disallow implicit union in Perl extended classes. */
4608
4609
3.86k
        if (class_op_state == CLASS_OP_OPERAND &&
4610
3.13k
            class_mode_state == CLASS_MODE_PERL_EXT)
4611
0
          {
4612
0
          errorcode = ERR113;
4613
0
          goto FAILED;
4614
0
          }
4615
4616
3.86k
        class_range_state = RANGE_FORBID_NO;
4617
3.86k
        class_op_state = CLASS_OP_OPERAND;
4618
3.86k
        }
4619
4620
      /* Forbid unescaped literals, and the special meaning of '-', inside a
4621
      Perl extended class. */
4622
4623
63.4k
      else if (class_mode_state == CLASS_MODE_PERL_EXT)
4624
0
        {
4625
0
        errorcode = ERR116;
4626
0
        goto FAILED;
4627
0
        }
4628
4629
      /* Handle potential start of range */
4630
4631
63.4k
      else if (c == CHAR_MINUS && class_range_state >= RANGE_OK_ESCAPED)
4632
2.07k
        {
4633
2.07k
        *parsed_pattern++ = (class_range_state == RANGE_OK_LITERAL)?
4634
2.07k
          META_RANGE_LITERAL : META_RANGE_ESCAPED;
4635
2.07k
        class_range_state = RANGE_STARTED;
4636
2.07k
        }
4637
4638
      /* Handle forbidden start of range */
4639
4640
61.4k
      else if (c == CHAR_MINUS && class_range_state == RANGE_FORBID_NO)
4641
0
        {
4642
0
        *parsed_pattern++ = CHAR_MINUS;
4643
0
        class_range_state = RANGE_FORBID_STARTED;
4644
0
        class_range_forbid_ptr = ptr;
4645
0
        }
4646
4647
      /* Handle a literal character */
4648
4649
61.4k
      else
4650
61.4k
        {
4651
63.3k
        CLASS_LITERAL:
4652
4653
        /* Disallow implicit union in Perl extended classes. */
4654
4655
63.3k
        if (class_op_state == CLASS_OP_OPERAND &&
4656
56.4k
            class_mode_state == CLASS_MODE_PERL_EXT)
4657
0
          {
4658
0
          errorcode = ERR113;
4659
0
          goto FAILED;
4660
0
          }
4661
4662
63.3k
        if (class_range_state == RANGE_STARTED)
4663
1.89k
          {
4664
1.89k
          if (c == parsed_pattern[-2])       /* Optimize one-char range */
4665
124
            parsed_pattern--;
4666
1.77k
          else if (parsed_pattern[-2] > c)   /* Check range is in order */
4667
7
            {
4668
7
            errorcode = ERR8;
4669
7
            goto FAILED;
4670
7
            }
4671
1.76k
          else
4672
1.76k
            {
4673
1.76k
            if (!char_is_literal && parsed_pattern[-1] == META_RANGE_LITERAL)
4674
204
              parsed_pattern[-1] = META_RANGE_ESCAPED;
4675
1.76k
            PARSED_LITERAL(c, parsed_pattern);
4676
1.76k
            }
4677
1.89k
          class_range_state = RANGE_NO;
4678
1.89k
          class_op_state = CLASS_OP_OPERAND;
4679
1.89k
          }
4680
61.4k
        else if (class_range_state == RANGE_FORBID_STARTED)
4681
0
          {
4682
0
          ptr = class_range_forbid_ptr;
4683
0
          errorcode = ERR50;
4684
0
          goto FAILED;
4685
0
          }
4686
61.4k
        else  /* Potential start of range */
4687
61.4k
          {
4688
61.4k
          class_range_state = char_is_literal?
4689
60.6k
            RANGE_OK_LITERAL : RANGE_OK_ESCAPED;
4690
61.4k
          class_op_state = CLASS_OP_OPERAND;
4691
61.4k
          PARSED_LITERAL(c, parsed_pattern);
4692
61.4k
          }
4693
63.3k
        }
4694
4695
      /* Proceed to next thing in the class. */
4696
4697
75.4k
      CLASS_CONTINUE:
4698
75.4k
      if (ptr >= ptrend)
4699
322
        {
4700
322
        if (class_mode_state == CLASS_MODE_PERL_EXT && class_depth_m1 > 0)
4701
0
          errorcode = ERR14;   /* Missing terminating ')' */
4702
322
        if (class_mode_state == CLASS_MODE_ALT_EXT &&
4703
0
            class_depth_m1 == 0 && class_maxdepth_m1 == 1)
4704
0
          errorcode = ERR112;  /* Missing terminating ']', but we saw '[ [ ]...' */
4705
322
        else
4706
322
          errorcode = ERR6;    /* Missing terminating ']' */
4707
322
        goto FAILED;
4708
322
        }
4709
75.1k
      GETCHARINCTEST(c, ptr);
4710
75.1k
      }     /* End of class-processing loop */
4711
4712
8.17k
    break;  /* End of character class */
4713
4714
4715
    /* ---- Opening parenthesis ---- */
4716
4717
432k
    case CHAR_LEFT_PARENTHESIS:
4718
432k
    if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4719
4720
    /* If ( is not followed by ? it is either a capture or a special verb or an
4721
    alpha assertion or a positive non-atomic lookahead. */
4722
4723
432k
    if (*ptr != CHAR_QUESTION_MARK)
4724
397k
      {
4725
397k
      const char *vn;
4726
4727
      /* Handle capturing brackets (or non-capturing if auto-capture is turned
4728
      off). */
4729
4730
397k
      if (*ptr != CHAR_ASTERISK)
4731
394k
        {
4732
394k
        nest_depth++;
4733
394k
        if ((options & PCRE2_NO_AUTO_CAPTURE) == 0)
4734
394k
          {
4735
394k
          if (cb->bracount >= MAX_GROUP_NUMBER)
4736
1
            {
4737
1
            errorcode = ERR97;
4738
1
            goto FAILED;
4739
1
            }
4740
394k
          cb->bracount++;
4741
394k
          *parsed_pattern++ = META_CAPTURE | cb->bracount;
4742
394k
          }
4743
0
        else *parsed_pattern++ = META_NOCAPTURE;
4744
394k
        }
4745
4746
      /* Do nothing for (* followed by end of pattern or ) so it gives a "bad
4747
      quantifier" error rather than "(*MARK) must have an argument". */
4748
4749
2.17k
      else if (ptrend - ptr <= 1 || (c = ptr[1]) == CHAR_RIGHT_PARENTHESIS)
4750
3
        break;
4751
4752
      /* Handle "alpha assertions" such as (*pla:...). Most of these are
4753
      synonyms for the historical symbolic assertions, but the script run and
4754
      non-atomic lookaround ones are new. They are distinguished by starting
4755
      with a lower case letter. Checking both ends of the alphabet makes this
4756
      work in all character codes. */
4757
4758
2.16k
      else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)
4759
3
        {
4760
3
        uint32_t meta;
4761
4762
3
        vn = alasnames;
4763
3
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4764
3
          &errorcode, cb)) goto FAILED;
4765
1
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4766
1
        if (*ptr != CHAR_COLON)
4767
1
          {
4768
1
          errorcode = ERR95;  /* Malformed */
4769
1
          goto FAILED_FORWARD;
4770
1
          }
4771
4772
        /* Scan the table of alpha assertion names */
4773
4774
0
        for (i = 0; i < alascount; i++)
4775
0
          {
4776
0
          if (namelen == alasmeta[i].len &&
4777
0
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4778
0
            break;
4779
0
          vn += alasmeta[i].len + 1;
4780
0
          }
4781
4782
0
        if (i >= alascount)
4783
0
          {
4784
0
          errorcode = ERR95;  /* Alpha assertion not recognized */
4785
0
          goto FAILED;
4786
0
          }
4787
4788
        /* Check for expecting an assertion condition. If so, only atomic
4789
        lookaround assertions are valid. */
4790
4791
0
        meta = alasmeta[i].meta;
4792
0
        if (prev_expect_cond_assert > 0 &&
4793
0
            (meta < META_LOOKAHEAD || meta > META_LOOKBEHINDNOT))
4794
0
          {
4795
0
          errorcode = ERR28;  /* Atomic assertion expected */
4796
0
          goto FAILED;
4797
0
          }
4798
4799
        /* The lookaround alphabetic synonyms can mostly be handled by jumping
4800
        to the code that handles the traditional symbolic forms. */
4801
4802
0
        switch(meta)
4803
0
          {
4804
          /* LCOV_EXCL_START */
4805
0
          default:
4806
0
          PCRE2_DEBUG_UNREACHABLE();
4807
0
          errorcode = ERR89;  /* Unknown code; should never occur because */
4808
0
          goto FAILED;        /* the meta values come from a table above. */
4809
          /* LCOV_EXCL_STOP */
4810
4811
0
          case META_ATOMIC:
4812
0
          goto ATOMIC_GROUP;
4813
4814
0
          case META_LOOKAHEAD:
4815
0
          goto POSITIVE_LOOK_AHEAD;
4816
4817
0
          case META_LOOKAHEAD_NA:
4818
0
          goto POSITIVE_NONATOMIC_LOOK_AHEAD;
4819
4820
0
          case META_LOOKAHEADNOT:
4821
0
          goto NEGATIVE_LOOK_AHEAD;
4822
4823
0
          case META_SCS:
4824
0
          ptr++;
4825
0
          *parsed_pattern++ = META_SCS;
4826
4827
0
          parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
4828
0
                                              0, &errorcode, cb);
4829
0
          if (parsed_pattern == NULL) goto FAILED;
4830
0
          goto POST_ASSERTION;
4831
4832
0
          case META_LOOKBEHIND:
4833
0
          case META_LOOKBEHINDNOT:
4834
0
          case META_LOOKBEHIND_NA:
4835
0
          *parsed_pattern++ = meta;
4836
0
          ptr--;
4837
0
          goto POST_LOOKBEHIND;
4838
4839
          /* The script run facilities are handled here. Unicode support is
4840
          required (give an error if not, as this is a security issue). Always
4841
          record a META_SCRIPT_RUN item. Then, for the atomic version, insert
4842
          META_ATOMIC and remember that we need two META_KETs at the end. */
4843
4844
0
          case META_SCRIPT_RUN:
4845
0
          case META_ATOMIC_SCRIPT_RUN:
4846
0
#ifdef SUPPORT_UNICODE
4847
0
          *parsed_pattern++ = META_SCRIPT_RUN;
4848
0
          nest_depth++;
4849
0
          ptr++;
4850
0
          if (meta == META_ATOMIC_SCRIPT_RUN)
4851
0
            {
4852
0
            *parsed_pattern++ = META_ATOMIC;
4853
0
            if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4854
0
            else if (++top_nest >= end_nests)
4855
0
              {
4856
0
              errorcode = ERR84;
4857
0
              goto FAILED;
4858
0
              }
4859
0
            top_nest->nest_depth = nest_depth;
4860
0
            top_nest->flags = NSF_ATOMICSR;
4861
0
            top_nest->options = options & PARSE_TRACKED_OPTIONS;
4862
0
            top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
4863
4864
#ifdef PCRE2_DEBUG
4865
            /* We'll write out two META_KETs for a single ")" in the input
4866
            pattern, so we reserve space for that in our bounds check. */
4867
            parsed_pattern_extra++;
4868
#endif
4869
0
            }
4870
0
          break;
4871
#else  /* SUPPORT_UNICODE */
4872
          errorcode = ERR96;
4873
          goto FAILED;
4874
#endif
4875
0
          }
4876
0
        }
4877
4878
4879
      /* ---- Handle (*VERB) and (*VERB:NAME) ---- */
4880
4881
2.16k
      else
4882
2.16k
        {
4883
2.16k
        vn = verbnames;
4884
2.16k
        if (!read_name(&ptr, ptrend, utf, 0, &offset, &name, &namelen,
4885
2.16k
          &errorcode, cb)) goto FAILED;
4886
2.15k
        if (ptr >= ptrend || (*ptr != CHAR_COLON &&
4887
1.14k
                              *ptr != CHAR_RIGHT_PARENTHESIS))
4888
682
          {
4889
682
          errorcode = ERR60;  /* Malformed */
4890
682
          goto FAILED;
4891
682
          }
4892
4893
        /* Scan the table of verb names */
4894
4895
8.01k
        for (i = 0; i < verbcount; i++)
4896
7.95k
          {
4897
7.95k
          if (namelen == verbs[i].len &&
4898
2.81k
              PRIV(strncmp_c8)(name, vn, namelen) == 0)
4899
1.42k
            break;
4900
6.53k
          vn += verbs[i].len + 1;
4901
6.53k
          }
4902
4903
1.47k
        if (i >= verbcount)
4904
51
          {
4905
51
          errorcode = ERR60;  /* Verb not recognized */
4906
51
          goto FAILED;
4907
51
          }
4908
4909
        /* An empty argument is treated as no argument. */
4910
4911
1.42k
        if (*ptr == CHAR_COLON && ptr + 1 < ptrend &&
4912
737
             ptr[1] == CHAR_RIGHT_PARENTHESIS)
4913
1
          ptr++;    /* Advance to the closing parens */
4914
4915
        /* Check for mandatory non-empty argument; this is (*MARK) */
4916
4917
1.42k
        if (verbs[i].has_arg > 0 && *ptr != CHAR_COLON)
4918
3
          {
4919
3
          errorcode = ERR66;
4920
3
          goto FAILED;
4921
3
          }
4922
4923
        /* Remember where this verb, possibly with a preceding (*MARK), starts,
4924
        for handling quantified (*ACCEPT). */
4925
4926
1.42k
        verbstartptr = parsed_pattern;
4927
1.42k
        okquantifier = (verbs[i].meta == META_ACCEPT);
4928
#ifdef PCRE2_DEBUG
4929
        /* Reserve space in our bounds check for optionally wrapping the (*ACCEPT)
4930
        with a non-capturing bracket, if there is a following quantifier. */
4931
        if (okquantifier) parsed_pattern_extra += 2;
4932
#endif
4933
4934
        /* It appears that Perl allows any characters whatsoever, other than a
4935
        closing parenthesis, to appear in arguments ("names"), so we no longer
4936
        insist on letters, digits, and underscores. Perl does not, however, do
4937
        any interpretation within arguments, and has no means of including a
4938
        closing parenthesis. PCRE supports escape processing but only when it
4939
        is requested by an option. We set inverbname TRUE here, and let the
4940
        main loop take care of this so that escape and \x processing is done by
4941
        the main code above. */
4942
4943
1.42k
        if (*ptr++ == CHAR_COLON)   /* Skip past : or ) */
4944
738
          {
4945
          /* Some optional arguments can be treated as a preceding (*MARK) */
4946
4947
738
          if (verbs[i].has_arg < 0)
4948
2
            {
4949
2
            add_after_mark = verbs[i].meta;
4950
2
            *parsed_pattern++ = META_MARK;
4951
2
            }
4952
4953
          /* The remaining verbs with arguments (except *MARK) need a different
4954
          opcode. */
4955
4956
736
          else
4957
736
            {
4958
736
            *parsed_pattern++ = verbs[i].meta +
4959
736
              ((verbs[i].meta != META_MARK)? 0x00010000u:0);
4960
736
            }
4961
4962
          /* Set up for reading the name in the main loop. */
4963
4964
738
          verblengthptr = parsed_pattern++;
4965
738
          verbnamestart = ptr;
4966
738
          inverbname = TRUE;
4967
738
          }
4968
685
        else  /* No verb "name" argument */
4969
685
          {
4970
685
          *parsed_pattern++ = verbs[i].meta;
4971
685
          }
4972
1.42k
        }     /* End of (*VERB) handling */
4973
396k
      break;  /* Done with this parenthesis */
4974
397k
      }       /* End of groups that don't start with (? */
4975
4976
4977
    /* ---- Items starting (? ---- */
4978
4979
    /* The type of item is determined by what follows (?. Handle (?| and option
4980
    changes under "default" because both need a new block on the nest stack.
4981
    Comments starting with (?# are handled above. Note that there is some
4982
    ambiguity about the sequence (?- because if a digit follows it's a relative
4983
    recursion or subroutine call whereas otherwise it's an option unsetting. */
4984
4985
35.0k
    if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4986
4987
35.0k
    switch(*ptr)
4988
35.0k
      {
4989
3.80k
      default:
4990
3.80k
      if (*ptr == CHAR_MINUS && ptrend - ptr > 1 && IS_DIGIT(ptr[1]))
4991
61
        goto RECURSION_BYNUMBER;  /* The + case is handled by CHAR_PLUS */
4992
4993
      /* We now have either (?| or a (possibly empty) option setting,
4994
      optionally followed by a non-capturing group. */
4995
4996
3.74k
      nest_depth++;
4997
3.74k
      if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
4998
2.03k
      else if (++top_nest >= end_nests)
4999
0
        {
5000
0
        errorcode = ERR84;
5001
0
        goto FAILED;
5002
0
        }
5003
3.74k
      top_nest->nest_depth = nest_depth;
5004
3.74k
      top_nest->flags = 0;
5005
3.74k
      top_nest->options = options & PARSE_TRACKED_OPTIONS;
5006
3.74k
      top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5007
5008
      /* Start of non-capturing group that resets the capture count for each
5009
      branch. */
5010
5011
3.74k
      if (*ptr == CHAR_VERTICAL_LINE)
5012
552
        {
5013
552
        top_nest->reset_group = (uint16_t)cb->bracount;
5014
552
        top_nest->max_group = (uint16_t)cb->bracount;
5015
552
        top_nest->flags |= NSF_RESET;
5016
552
        cb->external_flags |= PCRE2_DUPCAPUSED;
5017
552
        *parsed_pattern++ = META_NOCAPTURE;
5018
552
        ptr++;
5019
552
        }
5020
5021
      /* Scan for options imnrsxJU to be set or unset. */
5022
5023
3.18k
      else
5024
3.18k
        {
5025
3.18k
        BOOL hyphenok = TRUE;
5026
3.18k
        uint32_t oldoptions = options;
5027
3.18k
        uint32_t oldxoptions = xoptions;
5028
5029
3.18k
        top_nest->reset_group = 0;
5030
3.18k
        top_nest->max_group = 0;
5031
3.18k
        set = unset = 0;
5032
3.18k
        optset = &set;
5033
3.18k
        xset = xunset = 0;
5034
3.18k
        xoptset = &xset;
5035
5036
        /* ^ at the start unsets irmnsx and disables the subsequent use of - */
5037
5038
3.18k
        if (ptr < ptrend && *ptr == CHAR_CIRCUMFLEX_ACCENT)
5039
0
          {
5040
0
          options &= ~(PCRE2_CASELESS|PCRE2_MULTILINE|PCRE2_NO_AUTO_CAPTURE|
5041
0
                       PCRE2_DOTALL|PCRE2_EXTENDED|PCRE2_EXTENDED_MORE);
5042
0
          xoptions &= ~(PCRE2_EXTRA_CASELESS_RESTRICT);
5043
0
          hyphenok = FALSE;
5044
0
          ptr++;
5045
0
          }
5046
5047
6.15k
        while (ptr < ptrend && *ptr != CHAR_RIGHT_PARENTHESIS &&
5048
4.76k
                               *ptr != CHAR_COLON)
5049
2.99k
          {
5050
2.99k
          switch (*ptr++)
5051
2.99k
            {
5052
280
            case CHAR_MINUS:
5053
280
            if (!hyphenok)
5054
7
              {
5055
7
              errorcode = ERR94;
5056
7
              goto FAILED;
5057
7
              }
5058
273
            optset = &unset;
5059
273
            xoptset = &xunset;
5060
273
            hyphenok = FALSE;
5061
273
            break;
5062
5063
            /* There are some two-character sequences that start with 'a'. */
5064
5065
0
            case CHAR_a:
5066
0
            if (ptr < ptrend)
5067
0
              {
5068
0
              if (*ptr == CHAR_D)
5069
0
                {
5070
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSD;
5071
0
                ptr++;
5072
0
                break;
5073
0
                }
5074
0
              if (*ptr == CHAR_P)
5075
0
                {
5076
0
                *xoptset |= (PCRE2_EXTRA_ASCII_POSIX|PCRE2_EXTRA_ASCII_DIGIT);
5077
0
                ptr++;
5078
0
                break;
5079
0
                }
5080
0
              if (*ptr == CHAR_S)
5081
0
                {
5082
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSS;
5083
0
                ptr++;
5084
0
                break;
5085
0
                }
5086
0
              if (*ptr == CHAR_T)
5087
0
                {
5088
0
                *xoptset |= PCRE2_EXTRA_ASCII_DIGIT;
5089
0
                ptr++;
5090
0
                break;
5091
0
                }
5092
0
              if (*ptr == CHAR_W)
5093
0
                {
5094
0
                *xoptset |= PCRE2_EXTRA_ASCII_BSW;
5095
0
                ptr++;
5096
0
                break;
5097
0
                }
5098
0
              }
5099
0
            *xoptset |= PCRE2_EXTRA_ASCII_BSD|PCRE2_EXTRA_ASCII_BSS|
5100
0
                        PCRE2_EXTRA_ASCII_BSW|
5101
0
                        PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_ASCII_POSIX;
5102
0
            break;
5103
5104
963
            case CHAR_J:  /* Record that it changed in the external options */
5105
963
            *optset |= PCRE2_DUPNAMES;
5106
963
            cb->external_flags |= PCRE2_JCHANGED;
5107
963
            break;
5108
5109
642
            case CHAR_i: *optset |= PCRE2_CASELESS; break;
5110
172
            case CHAR_m: *optset |= PCRE2_MULTILINE; break;
5111
0
            case CHAR_n: *optset |= PCRE2_NO_AUTO_CAPTURE; break;
5112
0
            case CHAR_r: *xoptset|= PCRE2_EXTRA_CASELESS_RESTRICT; break;
5113
222
            case CHAR_s: *optset |= PCRE2_DOTALL; break;
5114
412
            case CHAR_U: *optset |= PCRE2_UNGREEDY; break;
5115
5116
            /* If x appears twice it sets the extended extended option. */
5117
5118
277
            case CHAR_x:
5119
277
            *optset |= PCRE2_EXTENDED;
5120
277
            if (ptr < ptrend && *ptr == CHAR_x)
5121
48
              {
5122
48
              *optset |= PCRE2_EXTENDED_MORE;
5123
48
              ptr++;
5124
48
              }
5125
277
            break;
5126
5127
31
            default:
5128
31
            errorcode = ERR11;
5129
31
            goto FAILED;
5130
2.99k
            }
5131
2.99k
          }
5132
5133
        /* If we are setting extended without extended-more, ensure that any
5134
        existing extended-more gets unset. Also, unsetting extended must also
5135
        unset extended-more. */
5136
5137
3.15k
        if ((set & (PCRE2_EXTENDED|PCRE2_EXTENDED_MORE)) == PCRE2_EXTENDED ||
5138
2.92k
            (unset & PCRE2_EXTENDED) != 0)
5139
226
          unset |= PCRE2_EXTENDED_MORE;
5140
5141
3.15k
        options = (options | set) & (~unset);
5142
3.15k
        xoptions = (xoptions | xset) & (~xunset);
5143
5144
        /* If the options ended with ')' this is not the start of a nested
5145
        group with option changes, so the options change at this level.
5146
        In this case, if the previous level set up a nest block, discard the
5147
        one we have just created. Otherwise adjust it for the previous level.
5148
        If the options ended with ':' we are starting a non-capturing group,
5149
        possibly with an options setting. */
5150
5151
3.15k
        if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5152
3.13k
        if (*ptr++ == CHAR_RIGHT_PARENTHESIS)
5153
1.36k
          {
5154
1.36k
          nest_depth--;  /* This is not a nested group after all. */
5155
1.36k
          if (top_nest > (nest_save *)(cb->start_workspace) &&
5156
696
              (top_nest-1)->nest_depth == nest_depth) top_nest--;
5157
1.10k
          else top_nest->nest_depth = nest_depth;
5158
1.36k
          }
5159
1.76k
        else *parsed_pattern++ = META_NOCAPTURE;
5160
5161
        /* If nothing changed, no need to record. */
5162
5163
3.13k
        if (options != oldoptions || xoptions != oldxoptions)
5164
1.24k
          {
5165
1.24k
          *parsed_pattern++ = META_OPTIONS;
5166
1.24k
          *parsed_pattern++ = options;
5167
1.24k
          *parsed_pattern++ = xoptions;
5168
1.24k
          }
5169
3.13k
        }     /* End options processing */
5170
3.68k
      break;  /* End default case after (? */
5171
5172
5173
      /* ---- Python syntax support ---- */
5174
5175
3.68k
      case CHAR_P:
5176
259
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5177
5178
      /* (?P<name> is the same as (?<name>, which defines a named group. */
5179
5180
258
      if (*ptr == CHAR_LESS_THAN_SIGN)
5181
12
        {
5182
12
        terminator = CHAR_GREATER_THAN_SIGN;
5183
12
        goto DEFINE_NAME;
5184
12
        }
5185
5186
      /* (?P>name) is the same as (?&name), which is a recursion or subroutine
5187
      call. */
5188
5189
246
      if (*ptr == CHAR_GREATER_THAN_SIGN) goto RECURSE_BY_NAME;
5190
5191
      /* (?P=name) is the same as \k<name>, a back reference by name. Anything
5192
      else after (?P is an error. */
5193
5194
58
      if (*ptr != CHAR_EQUALS_SIGN)
5195
11
        {
5196
11
        errorcode = ERR41;
5197
11
        goto FAILED_FORWARD;
5198
11
        }
5199
47
      if (!read_name(&ptr, ptrend, utf, CHAR_RIGHT_PARENTHESIS, &offset, &name,
5200
47
          &namelen, &errorcode, cb)) goto FAILED;
5201
46
      *parsed_pattern++ = META_BACKREF_BYNAME;
5202
46
      *parsed_pattern++ = namelen;
5203
46
      PUTOFFSET(offset, parsed_pattern);
5204
46
      okquantifier = TRUE;
5205
46
      break;   /* End of (?P processing */
5206
5207
5208
      /* ---- Recursion/subroutine calls by number ---- */
5209
5210
640
      case CHAR_R:
5211
640
      i = 0;         /* (?R) == (?R0) */
5212
640
      ptr++;
5213
640
      if (ptr >= ptrend || (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_LEFT_PARENTHESIS))
5214
5
        {
5215
5
        errorcode = ERR58;
5216
5
        goto FAILED;
5217
5
        }
5218
635
      terminator = CHAR_NUL;
5219
635
      goto SET_RECURSION;
5220
5221
      /* An item starting (?- followed by a digit comes here via the "default"
5222
      case because (?- followed by a non-digit is an options setting. */
5223
5224
261
      case CHAR_PLUS:
5225
261
      if (ptr + 1 >= ptrend)
5226
1
        {
5227
1
        ++ptr;
5228
1
        goto UNCLOSED_PARENTHESIS;
5229
1
        }
5230
260
      if (!IS_DIGIT(ptr[1]))
5231
8
        {
5232
8
        errorcode = ERR29;   /* Missing number */
5233
8
        ++ptr;
5234
8
        goto FAILED_FORWARD;
5235
8
        }
5236
252
      PCRE2_FALLTHROUGH /* Fall through */
5237
252
5238
4.03k
      case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
5239
4.88k
      case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
5240
4.95k
      RECURSION_BYNUMBER:
5241
4.95k
      if (!read_number(&ptr, ptrend,
5242
4.95k
          (IS_DIGIT(*ptr))? -1:(int)(cb->bracount), /* + and - are relative */
5243
4.95k
          MAX_GROUP_NUMBER, ERR61,
5244
4.95k
          &i, &errorcode)) goto FAILED;
5245
4.82k
      PCRE2_ASSERT(i >= 0);  /* NB (?0) is permitted, represented by i=0 */
5246
4.82k
      terminator = CHAR_NUL;
5247
5248
10.0k
      SET_RECURSION:
5249
10.0k
      *parsed_pattern++ = META_RECURSE | (uint32_t)i;
5250
10.0k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5251
      /* End of recursive call by number handling */
5252
10.0k
      goto READ_RECURSION_ARGUMENTS;
5253
5254
5255
      /* ---- Recursion/subroutine calls by name ---- */
5256
5257
365
      case CHAR_AMPERSAND:
5258
553
      RECURSE_BY_NAME:
5259
553
      if (!read_name(&ptr, ptrend, utf, 0, &offset, &name,
5260
553
          &namelen, &errorcode, cb)) goto FAILED;
5261
550
      *parsed_pattern++ = META_RECURSE_BYNAME;
5262
550
      *parsed_pattern++ = namelen;
5263
550
      terminator = CHAR_NUL;
5264
5265
10.5k
      READ_RECURSION_ARGUMENTS:
5266
10.5k
      PUTOFFSET(offset, parsed_pattern);
5267
10.5k
      okquantifier = TRUE;
5268
5269
      /* Arguments are not supported for \g construct. */
5270
10.5k
      if (terminator != CHAR_NUL) break;
5271
5272
6.01k
      if (ptr < ptrend && *ptr == CHAR_LEFT_PARENTHESIS)
5273
0
        {
5274
0
        parsed_pattern = parse_capture_list(&ptr, ptrend, utf, parsed_pattern,
5275
0
                                            offset, &errorcode, cb);
5276
0
        if (parsed_pattern == NULL) goto FAILED;
5277
0
        }
5278
5279
6.01k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5280
31
        goto UNCLOSED_PARENTHESIS;
5281
5282
5.98k
      ptr++;
5283
5.98k
      break;
5284
5285
      /* ---- Callout with numerical or string argument ---- */
5286
5287
1.02k
      case CHAR_C:
5288
1.02k
      if ((xoptions & PCRE2_EXTRA_NEVER_CALLOUT) != 0)
5289
0
        {
5290
0
        ptr++;
5291
0
        errorcode = ERR103;
5292
0
        goto FAILED;
5293
0
        }
5294
5295
1.02k
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5296
5297
      /* If the previous item was a condition starting (?(? an assertion,
5298
      optionally preceded by a callout, is expected. This is checked later on,
5299
      during actual compilation. However we need to identify this kind of
5300
      assertion in this pass because it must not be qualified. The value of
5301
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5302
      for a callout - still leaving a positive value that identifies the
5303
      assertion. Multiple callouts or any other items will make it zero or
5304
      less, which doesn't matter because they will cause an error later. */
5305
5306
1.02k
      expect_cond_assert = prev_expect_cond_assert - 1;
5307
5308
      /* If previous_callout is not NULL, it means this follows a previous
5309
      callout. If it was a manual callout, do nothing; this means its "length
5310
      of next pattern item" field will remain zero. If it was an automatic
5311
      callout, abolish it. */
5312
5313
1.02k
      if (previous_callout != NULL && (options & PCRE2_AUTO_CALLOUT) != 0 &&
5314
0
          previous_callout == parsed_pattern - 4 &&
5315
0
          parsed_pattern[-1] == 255)
5316
0
        parsed_pattern = previous_callout;
5317
5318
      /* Save for updating next pattern item length, and skip one item before
5319
      completing. */
5320
5321
1.02k
      previous_callout = parsed_pattern;
5322
1.02k
      after_manual_callout = 1;
5323
5324
      /* Handle a string argument; specific delimiter is required. */
5325
5326
1.02k
      if (*ptr != CHAR_RIGHT_PARENTHESIS && !IS_DIGIT(*ptr))
5327
13
        {
5328
13
        PCRE2_SIZE calloutlength;
5329
13
        PCRE2_SPTR startptr = ptr;
5330
5331
13
        delimiter = 0;
5332
117
        for (i = 0; PRIV(callout_start_delims)[i] != 0; i++)
5333
104
          {
5334
104
          if (*ptr == PRIV(callout_start_delims)[i])
5335
0
            {
5336
0
            delimiter = PRIV(callout_end_delims)[i];
5337
0
            break;
5338
0
            }
5339
104
          }
5340
13
        if (delimiter == 0)
5341
13
          {
5342
13
          errorcode = ERR82;
5343
13
          goto FAILED_FORWARD;
5344
13
          }
5345
5346
0
        *parsed_pattern = META_CALLOUT_STRING;
5347
0
        parsed_pattern += 3;   /* Skip pattern info */
5348
5349
0
        for (;;)
5350
0
          {
5351
0
          if (++ptr >= ptrend)
5352
0
            {
5353
0
            errorcode = ERR81;
5354
0
            ptr = startptr;   /* To give a more useful message */
5355
0
            goto FAILED;
5356
0
            }
5357
0
          if (*ptr == delimiter && (++ptr >= ptrend || *ptr != delimiter))
5358
0
            break;
5359
0
          }
5360
5361
0
        calloutlength = (PCRE2_SIZE)(ptr - startptr);
5362
0
        if (calloutlength > UINT32_MAX)
5363
0
          {
5364
0
          errorcode = ERR72;
5365
0
          goto FAILED;
5366
0
          }
5367
0
        *parsed_pattern++ = (uint32_t)calloutlength;
5368
0
        offset = (PCRE2_SIZE)(startptr - cb->start_pattern);
5369
0
        PUTOFFSET(offset, parsed_pattern);
5370
0
        }
5371
5372
      /* Handle a callout with an optional numerical argument, which must be
5373
      less than or equal to 255. A missing argument gives 0. */
5374
5375
1.01k
      else
5376
1.01k
        {
5377
1.01k
        int n = 0;
5378
1.01k
        *parsed_pattern = META_CALLOUT_NUMBER;     /* Numerical callout */
5379
1.01k
        parsed_pattern += 3;                       /* Skip pattern info */
5380
1.14k
        while (ptr < ptrend && IS_DIGIT(*ptr))
5381
135
          {
5382
135
          n = n * 10 + (*ptr++ - CHAR_0);
5383
135
          if (n > 255)
5384
5
            {
5385
5
            errorcode = ERR38;
5386
5
            goto FAILED;
5387
5
            }
5388
135
          }
5389
1.00k
        *parsed_pattern++ = n;
5390
1.00k
        }
5391
5392
      /* Both formats must have a closing parenthesis */
5393
5394
1.00k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5395
7
        {
5396
7
        errorcode = ERR39;
5397
7
        goto FAILED;
5398
7
        }
5399
999
      ptr++;
5400
5401
      /* Remember the offset to the next item in the pattern, and set a default
5402
      length. This should get updated after the next item is read. */
5403
5404
999
      previous_callout[1] = (uint32_t)(ptr - cb->start_pattern);
5405
999
      previous_callout[2] = 0;
5406
999
      break;                  /* End callout */
5407
5408
5409
      /* ---- Conditional group ---- */
5410
5411
      /* A condition can be an assertion, a number (referring to a numbered
5412
      group's having been set), a name (referring to a named group), or 'R',
5413
      referring to overall recursion. R<digits> and R&name are also permitted
5414
      for recursion state tests. Numbers may be preceded by + or - to specify a
5415
      relative group number.
5416
5417
      There are several syntaxes for testing a named group: (?(name)) is used
5418
      by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5419
5420
      There are two unfortunate ambiguities. 'R' can be the recursive thing or
5421
      the name 'R' (and similarly for 'R' followed by digits). 'DEFINE' can be
5422
      the Perl DEFINE feature or the Python named test. We look for a name
5423
      first; if not found, we try the other case.
5424
5425
      For compatibility with auto-callouts, we allow a callout to be specified
5426
      before a condition that is an assertion. */
5427
5428
6.70k
      case CHAR_LEFT_PARENTHESIS:
5429
6.70k
      if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
5430
6.70k
      nest_depth++;
5431
5432
      /* If the next character is ? or * there must be an assertion next
5433
      (optionally preceded by a callout). We do not check this here, but
5434
      instead we set expect_cond_assert to 2. If this is still greater than
5435
      zero (callouts decrement it) when the next assertion is read, it will be
5436
      marked as a condition that must not be repeated. A value greater than
5437
      zero also causes checking that an assertion (possibly with callout)
5438
      follows. */
5439
5440
6.70k
      if (*ptr == CHAR_QUESTION_MARK || *ptr == CHAR_ASTERISK)
5441
1.08k
        {
5442
1.08k
        *parsed_pattern++ = META_COND_ASSERT;
5443
1.08k
        ptr--;   /* Pull pointer back to the opening parenthesis. */
5444
1.08k
        expect_cond_assert = 2;
5445
1.08k
        break;  /* End of conditional */
5446
1.08k
        }
5447
5448
      /* Handle (?([+-]number)... */
5449
5450
5.62k
      if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61, &i,
5451
5.62k
          &errorcode))
5452
1.31k
        {
5453
1.31k
        PCRE2_ASSERT(i >= 0);
5454
1.31k
        if (i <= 0)
5455
4
          {
5456
4
          errorcode = ERR15;
5457
4
          goto FAILED;
5458
4
          }
5459
1.30k
        *parsed_pattern++ = META_COND_NUMBER;
5460
1.30k
        offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5461
1.30k
        PUTOFFSET(offset, parsed_pattern);
5462
1.30k
        *parsed_pattern++ = i;
5463
1.30k
        }
5464
4.31k
      else if (errorcode != 0) goto FAILED;   /* Number too big */
5465
5466
      /* No number found. Handle the special case (?(VERSION[>]=n.m)... */
5467
5468
4.23k
      else if (ptrend - ptr >= 10 &&
5469
3.90k
               PRIV(strncmp_c8)(ptr, STRING_VERSION, 7) == 0 &&
5470
0
               ptr[7] != CHAR_RIGHT_PARENTHESIS)
5471
0
        {
5472
0
        uint32_t ge = 0;
5473
0
        int major = 0;
5474
0
        int minor = 0;
5475
5476
0
        ptr += 7;
5477
0
        if (*ptr == CHAR_GREATER_THAN_SIGN)
5478
0
          {
5479
0
          ge = 1;
5480
0
          ptr++;
5481
0
          }
5482
5483
        /* NOTE: cannot write IS_DIGIT(*(++ptr)) here because IS_DIGIT
5484
        references its argument twice. */
5485
5486
0
        if (*ptr != CHAR_EQUALS_SIGN || (ptr++, !IS_DIGIT(*ptr)))
5487
0
          {
5488
0
          errorcode = ERR79;
5489
0
          if (!ge) goto FAILED_FORWARD;
5490
0
          goto FAILED;
5491
0
          }
5492
5493
0
        if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &major, &errorcode))
5494
0
          goto FAILED;
5495
5496
0
        if (ptr < ptrend && *ptr == CHAR_DOT)
5497
0
          {
5498
0
          if (++ptr >= ptrend || !IS_DIGIT(*ptr))
5499
0
            {
5500
0
            errorcode = ERR79;
5501
0
            if (ptr < ptrend) goto FAILED_FORWARD;
5502
0
            goto FAILED;
5503
0
            }
5504
0
          if (!read_number(&ptr, ptrend, -1, 1000, ERR79, &minor, &errorcode))
5505
0
            goto FAILED;
5506
0
          }
5507
0
        if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5508
0
          {
5509
0
          errorcode = ERR79;
5510
0
          if (ptr < ptrend) goto FAILED_FORWARD;
5511
0
          goto FAILED;
5512
0
          }
5513
5514
0
        *parsed_pattern++ = META_COND_VERSION;
5515
0
        *parsed_pattern++ = ge;
5516
0
        *parsed_pattern++ = major;
5517
0
        *parsed_pattern++ = minor;
5518
0
        }
5519
5520
      /* All the remaining cases now require us to read a name. We cannot at
5521
      this stage distinguish ambiguous cases such as (?(R12) which might be a
5522
      recursion test by number or a name, because the named groups have not yet
5523
      all been identified. Those cases are treated as names, but given a
5524
      different META code. */
5525
5526
4.23k
      else
5527
4.23k
        {
5528
4.23k
        BOOL was_r_ampersand = FALSE;
5529
5530
4.23k
        if (*ptr == CHAR_R && ptrend - ptr > 1 && ptr[1] == CHAR_AMPERSAND)
5531
130
          {
5532
130
          terminator = CHAR_RIGHT_PARENTHESIS;
5533
130
          was_r_ampersand = TRUE;
5534
130
          ptr++;
5535
130
          }
5536
4.10k
        else if (*ptr == CHAR_LESS_THAN_SIGN)
5537
182
          terminator = CHAR_GREATER_THAN_SIGN;
5538
3.92k
        else if (*ptr == CHAR_APOSTROPHE)
5539
139
          terminator = CHAR_APOSTROPHE;
5540
3.78k
        else
5541
3.78k
          {
5542
3.78k
          terminator = CHAR_RIGHT_PARENTHESIS;
5543
3.78k
          ptr--;   /* Point to char before name */
5544
3.78k
          }
5545
5546
4.23k
        if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5547
4.23k
            &errorcode, cb)) goto FAILED;
5548
5549
        /* Handle (?(R&name) */
5550
5551
4.17k
        if (was_r_ampersand)
5552
129
          {
5553
129
          *parsed_pattern = META_COND_RNAME;
5554
129
          ptr--;   /* Back to closing parens */
5555
129
          }
5556
5557
        /* Handle (?(name). If the name is "DEFINE" we identify it with a
5558
        special code. Likewise if the name consists of R followed only by
5559
        digits. Otherwise, handle it like a quoted name. */
5560
5561
4.04k
        else if (terminator == CHAR_RIGHT_PARENTHESIS)
5562
3.73k
          {
5563
3.73k
          if (namelen == 6 && PRIV(strncmp_c8)(name, STRING_DEFINE, 6) == 0)
5564
288
            *parsed_pattern = META_COND_DEFINE;
5565
3.44k
          else
5566
3.44k
            {
5567
4.16k
            for (i = 1; i < (int)namelen; i++)
5568
869
              if (!IS_DIGIT(name[i])) break;
5569
3.44k
            *parsed_pattern = (*name == CHAR_R && i >= (int)namelen)?
5570
2.47k
              META_COND_RNUMBER : META_COND_NAME;
5571
3.44k
            }
5572
3.73k
          ptr--;   /* Back to closing parens */
5573
3.73k
          }
5574
5575
        /* Handle (?('name') or (?(<name>) */
5576
5577
309
        else *parsed_pattern = META_COND_NAME;
5578
5579
        /* All these cases except DEFINE end with the name length and offset;
5580
        DEFINE just has an offset (for the "too many branches" error). */
5581
5582
4.17k
        if (*parsed_pattern++ != META_COND_DEFINE) *parsed_pattern++ = namelen;
5583
4.17k
        PUTOFFSET(offset, parsed_pattern);
5584
4.17k
        }  /* End cases that read a name */
5585
5586
      /* Check the closing parenthesis of the condition */
5587
5588
5.48k
      if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)
5589
16
        {
5590
16
        errorcode = ERR24;
5591
16
        goto FAILED;
5592
16
        }
5593
5.46k
      ptr++;
5594
5.46k
      break;  /* End of condition processing */
5595
5596
5597
      /* ---- Atomic group ---- */
5598
5599
1.77k
      case CHAR_GREATER_THAN_SIGN:
5600
1.77k
      ATOMIC_GROUP:                          /* Come from (*atomic: */
5601
1.77k
      *parsed_pattern++ = META_ATOMIC;
5602
1.77k
      nest_depth++;
5603
1.77k
      ptr++;
5604
1.77k
      break;
5605
5606
5607
      /* ---- Lookahead assertions ---- */
5608
5609
2.59k
      case CHAR_EQUALS_SIGN:
5610
2.59k
      POSITIVE_LOOK_AHEAD:                   /* Come from (*pla: */
5611
2.59k
      *parsed_pattern++ = META_LOOKAHEAD;
5612
2.59k
      ptr++;
5613
2.59k
      goto POST_ASSERTION;
5614
5615
0
      case CHAR_ASTERISK:
5616
0
      POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (*napla: */
5617
0
      *parsed_pattern++ = META_LOOKAHEAD_NA;
5618
0
      ptr++;
5619
0
      goto POST_ASSERTION;
5620
5621
1.52k
      case CHAR_EXCLAMATION_MARK:
5622
1.52k
      NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */
5623
1.52k
      *parsed_pattern++ = META_LOOKAHEADNOT;
5624
1.52k
      ptr++;
5625
1.52k
      goto POST_ASSERTION;
5626
5627
5628
      /* ---- Lookbehind assertions ---- */
5629
5630
      /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?<
5631
      is the start of the name of a capturing group. */
5632
5633
9.47k
      case CHAR_LESS_THAN_SIGN:
5634
9.47k
      if (ptrend - ptr <= 1 ||
5635
9.47k
         (ptr[1] != CHAR_EQUALS_SIGN &&
5636
8.89k
          ptr[1] != CHAR_EXCLAMATION_MARK &&
5637
7.71k
          ptr[1] != CHAR_ASTERISK))
5638
7.71k
        {
5639
7.71k
        terminator = CHAR_GREATER_THAN_SIGN;
5640
7.71k
        goto DEFINE_NAME;
5641
7.71k
        }
5642
1.76k
      *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)?
5643
1.18k
        META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)?
5644
1.18k
        META_LOOKBEHINDNOT : META_LOOKBEHIND_NA;
5645
5646
1.76k
      POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */
5647
1.76k
      *has_lookbehind = TRUE;
5648
1.76k
      offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
5649
1.76k
      PUTOFFSET(offset, parsed_pattern);
5650
1.76k
      ptr += 2;
5651
      /* Fall through */
5652
5653
      /* If the previous item was a condition starting (?(? an assertion,
5654
      optionally preceded by a callout, is expected. This is checked later on,
5655
      during actual compilation. However we need to identify this kind of
5656
      assertion in this pass because it must not be qualified. The value of
5657
      expect_cond_assert is set to 2 after (?(? is processed. We decrement it
5658
      for a callout - still leaving a positive value that identifies the
5659
      assertion. Multiple callouts or any other items will make it zero or
5660
      less, which doesn't matter because they will cause an error later. */
5661
5662
5.87k
      POST_ASSERTION:
5663
5.87k
      nest_depth++;
5664
5.87k
      if (prev_expect_cond_assert > 0)
5665
1.03k
        {
5666
1.03k
        if (top_nest == NULL) top_nest = (nest_save *)(cb->start_workspace);
5667
643
        else if (++top_nest >= end_nests)
5668
0
          {
5669
0
          errorcode = ERR84;
5670
0
          goto FAILED;
5671
0
          }
5672
1.03k
        top_nest->nest_depth = nest_depth;
5673
1.03k
        top_nest->flags = NSF_CONDASSERT;
5674
1.03k
        top_nest->options = options & PARSE_TRACKED_OPTIONS;
5675
1.03k
        top_nest->xoptions = xoptions & PARSE_TRACKED_EXTRA_OPTIONS;
5676
1.03k
        }
5677
5.87k
      break;
5678
5679
5680
      /* ---- Define a named group ---- */
5681
5682
      /* A named group may be defined as (?'name') or (?<name>). In the latter
5683
      case we jump to DEFINE_NAME from the disambiguation of (?< above with the
5684
      terminator set to '>'. */
5685
5686
5.87k
      case CHAR_APOSTROPHE:
5687
2.02k
      terminator = CHAR_APOSTROPHE;    /* Terminator */
5688
5689
9.75k
      DEFINE_NAME:
5690
9.75k
      if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name, &namelen,
5691
9.75k
          &errorcode, cb)) goto FAILED;
5692
5693
      /* We have a name for this capturing group. It is also assigned a number,
5694
      which is its primary means of identification. */
5695
5696
9.53k
      if (cb->bracount >= MAX_GROUP_NUMBER)
5697
0
        {
5698
0
        errorcode = ERR97;
5699
0
        goto FAILED;
5700
0
        }
5701
9.53k
      cb->bracount++;
5702
9.53k
      *parsed_pattern++ = META_CAPTURE | cb->bracount;
5703
9.53k
      nest_depth++;
5704
5705
      /* Check not too many names */
5706
5707
9.53k
      if (cb->names_found >= MAX_NAME_COUNT)
5708
0
        {
5709
0
        errorcode = ERR49;
5710
0
        goto FAILED;
5711
0
        }
5712
5713
      /* Adjust the entry size to accommodate the longest name found. */
5714
5715
9.53k
      if (namelen + IMM2_SIZE + 1 > cb->name_entry_size)
5716
1.20k
        cb->name_entry_size = (uint16_t)(namelen + IMM2_SIZE + 1);
5717
5718
      /* Scan the list to check for duplicates. For duplicate names, if the
5719
      number is the same, break the loop, which causes the name to be
5720
      discarded; otherwise, if DUPNAMES is not set, give an error.
5721
      If it is set, allow the name with a different number, but continue
5722
      scanning in case this is a duplicate with the same number. For
5723
      non-duplicate names, give an error if the number is duplicated. */
5724
5725
9.53k
      is_dupname = FALSE;
5726
9.53k
      hash = PRIV(compile_get_hash_from_name)(name, namelen);
5727
9.53k
      ng = cb->named_groups;
5728
57.8k
      for (i = 0; i < cb->names_found; i++, ng++)
5729
56.1k
        {
5730
56.1k
        if (namelen == ng->length && hash == NAMED_GROUP_GET_HASH(ng) &&
5731
7.88k
            PRIV(strncmp)(name, ng->name, (PCRE2_SIZE)namelen) == 0)
5732
7.79k
          {
5733
          /* When a bracket is referenced by the same name multiple
5734
          times, is not considered as a duplicate and ignored. */
5735
7.79k
          if (ng->number == cb->bracount) break;
5736
7.79k
          if ((options & PCRE2_DUPNAMES) == 0)
5737
2
            {
5738
2
            errorcode = ERR43;
5739
2
            goto FAILED;
5740
2
            }
5741
5742
7.79k
          ng->hash_dup |= NAMED_GROUP_IS_DUPNAME;
5743
7.79k
          is_dupname = TRUE;                /* Mark as a duplicate */
5744
7.79k
          cb->dupnames = TRUE;              /* Duplicate names exist */
5745
5746
          /* The entry represents a duplicate. */
5747
7.79k
          name = ng->name;
5748
7.79k
          namelen = 0;
5749
5750
          /* Even duplicated names may refer to the same
5751
          capture index. These references are also ignored. */
5752
10.3M
          for (; i < cb->names_found; i++, ng++)
5753
10.3M
            if (ng->name == name && ng->number == cb->bracount)
5754
1
              break;
5755
7.79k
          break;
5756
7.79k
          }
5757
48.3k
        else if (ng->number == cb->bracount)
5758
0
          {
5759
0
          errorcode = ERR65;
5760
0
          goto FAILED;
5761
0
          }
5762
56.1k
        }
5763
5764
      /* Ignore duplicate with same number. */
5765
9.53k
      if (i < cb->names_found) break;
5766
5767
      /* Increase the list size if necessary */
5768
5769
9.53k
      if (cb->names_found >= cb->named_group_list_size)
5770
48
        {
5771
48
        uint32_t newsize = cb->named_group_list_size * 2;
5772
48
        named_group *newspace =
5773
48
          cb->cx->memctl.malloc(newsize * sizeof(named_group),
5774
48
          cb->cx->memctl.memory_data);
5775
48
        if (newspace == NULL)
5776
0
          {
5777
0
          errorcode = ERR21;
5778
0
          goto FAILED;
5779
0
          }
5780
5781
48
        memcpy(newspace, cb->named_groups,
5782
48
          cb->named_group_list_size * sizeof(named_group));
5783
48
        if (cb->named_group_list_size > NAMED_GROUP_LIST_SIZE)
5784
31
          cb->cx->memctl.free((void *)cb->named_groups,
5785
31
          cb->cx->memctl.memory_data);
5786
48
        cb->named_groups = newspace;
5787
48
        cb->named_group_list_size = newsize;
5788
48
        }
5789
5790
      /* Add this name to the list */
5791
9.53k
      if (is_dupname)
5792
7.79k
        hash |= NAMED_GROUP_IS_DUPNAME;
5793
5794
9.53k
      cb->named_groups[cb->names_found].name = name;
5795
9.53k
      cb->named_groups[cb->names_found].length = (uint16_t)namelen;
5796
9.53k
      cb->named_groups[cb->names_found].number = cb->bracount;
5797
9.53k
      cb->named_groups[cb->names_found].hash_dup = hash;
5798
9.53k
      cb->names_found++;
5799
9.53k
      break;
5800
5801
5802
      /* ---- Perl extended character class ---- */
5803
5804
      /* These are of the form '(?[...])'. We handle these via the same parser
5805
      that consumes ordinary '[...]' classes, but with a flag set to activate
5806
      the extended behaviour. */
5807
5808
0
      case CHAR_LEFT_SQUARE_BRACKET:
5809
0
      class_mode_state = CLASS_MODE_PERL_EXT;
5810
0
      c = *ptr++;
5811
0
      goto FROM_PERL_EXTENDED_CLASS;
5812
35.0k
      }        /* End of (? switch */
5813
38.9k
    break;     /* End of ( handling */
5814
5815
5816
    /* ---- Branch terminators ---- */
5817
5818
    /* Alternation: reset the capture count if we are in a (?| group. */
5819
5820
38.9k
    case CHAR_VERTICAL_LINE:
5821
18.7k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth &&
5822
2.53k
        (top_nest->flags & NSF_RESET) != 0)
5823
279
      {
5824
279
      if (cb->bracount > top_nest->max_group)
5825
27
        top_nest->max_group = (uint16_t)cb->bracount;
5826
279
      cb->bracount = top_nest->reset_group;
5827
279
      }
5828
18.7k
    *parsed_pattern++ = META_ALT;
5829
18.7k
    break;
5830
5831
    /* End of group; reset the capture count to the maximum if we are in a (?|
5832
    group and/or reset the options that are tracked during parsing. Disallow
5833
    quantifier for a condition that is an assertion. */
5834
5835
410k
    case CHAR_RIGHT_PARENTHESIS:
5836
410k
    okquantifier = TRUE;
5837
410k
    if (top_nest != NULL && top_nest->nest_depth == nest_depth)
5838
3.52k
      {
5839
3.52k
      options = (options & ~PARSE_TRACKED_OPTIONS) | top_nest->options;
5840
3.52k
      xoptions = (xoptions & ~PARSE_TRACKED_EXTRA_OPTIONS) | top_nest->xoptions;
5841
3.52k
      if ((top_nest->flags & NSF_RESET) != 0 &&
5842
505
          top_nest->max_group > cb->bracount)
5843
12
        cb->bracount = top_nest->max_group;
5844
3.52k
      if ((top_nest->flags & NSF_CONDASSERT) != 0)
5845
1.00k
        okquantifier = FALSE;
5846
5847
3.52k
      if ((top_nest->flags & NSF_ATOMICSR) != 0)
5848
0
        {
5849
0
        *parsed_pattern++ = META_KET;
5850
5851
#ifdef PCRE2_DEBUG
5852
        PCRE2_ASSERT(parsed_pattern_extra > 0);
5853
        parsed_pattern_extra--;
5854
#endif
5855
0
        }
5856
5857
3.52k
      if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;
5858
2.19k
        else top_nest--;
5859
3.52k
      }
5860
410k
    if (nest_depth == 0)    /* Unmatched closing parenthesis */
5861
289
      {
5862
289
      errorcode = ERR22;
5863
289
      goto FAILED;
5864
289
      }
5865
410k
    nest_depth--;
5866
410k
    *parsed_pattern++ = META_KET;
5867
410k
    break;
5868
1.30M
    }  /* End of switch on pattern character */
5869
1.30M
  }    /* End of main character scan loop */
5870
5871
/* End of pattern reached. Check for missing ) at the end of a verb name. */
5872
5873
8.25k
if (inverbname && ptr >= ptrend)
5874
30
  {
5875
30
  errorcode = ERR60;
5876
30
  goto FAILED;
5877
30
  }
5878
5879
5880
8.22k
PARSED_END:
5881
5882
8.22k
PCRE2_ASSERT((parsed_pattern - parsed_pattern_check) +
5883
8.22k
             (parsed_pattern_extra - parsed_pattern_extra_check) <=
5884
8.22k
               max_parsed_pattern(ptr_check, ptr, utf, options));
5885
5886
/* Manage callout for the final item */
5887
5888
8.22k
parsed_pattern = manage_callouts(ptr, &previous_callout, auto_callout,
5889
8.22k
  parsed_pattern, cb);
5890
5891
/* Insert trailing items for word and line matching (features provided for the
5892
benefit of pcre2grep). */
5893
5894
8.22k
if ((xoptions & PCRE2_EXTRA_MATCH_LINE) != 0)
5895
0
  {
5896
0
  *parsed_pattern++ = META_KET;
5897
0
  *parsed_pattern++ = META_DOLLAR;
5898
0
  }
5899
8.22k
else if ((xoptions & PCRE2_EXTRA_MATCH_WORD) != 0)
5900
0
  {
5901
0
  *parsed_pattern++ = META_KET;
5902
0
  *parsed_pattern++ = META_ESCAPE + ESC_b;
5903
0
  }
5904
5905
/* Terminate the parsed pattern, then return success if all groups are closed.
5906
Otherwise we have unclosed parentheses. */
5907
5908
/* LCOV_EXCL_START */
5909
8.22k
if (parsed_pattern >= parsed_pattern_end)
5910
0
  {
5911
0
  PCRE2_DEBUG_UNREACHABLE();
5912
0
  errorcode = ERR63;  /* Internal error (parsed pattern overflow) */
5913
0
  goto FAILED;
5914
0
  }
5915
/* LCOV_EXCL_STOP */
5916
5917
8.22k
*parsed_pattern = META_END;
5918
8.22k
if (nest_depth == 0) return 0;
5919
5920
611
UNCLOSED_PARENTHESIS:
5921
611
errorcode = ERR14;
5922
5923
/* Come here for all failures. */
5924
5925
3.38k
FAILED:
5926
3.38k
cb->erroroffset = (PCRE2_SIZE)(ptr - cb->start_pattern);
5927
3.38k
return errorcode;
5928
5929
/* Some errors need to indicate the previous character. */
5930
5931
2
FAILED_BACK:
5932
2
ptr--;
5933
2
#ifdef SUPPORT_UNICODE
5934
2
if (utf) BACKCHAR(ptr);
5935
2
#endif
5936
2
goto FAILED;
5937
5938
/* Some errors need to indicate the next character. */
5939
5940
33
FAILED_FORWARD:
5941
33
ptr++;
5942
33
#ifdef SUPPORT_UNICODE
5943
33
if (utf) FORWARDCHARTEST(ptr, ptrend);
5944
33
#endif
5945
33
goto FAILED;
5946
611
}
5947
5948
5949
5950
/*************************************************
5951
*       Find first significant opcode            *
5952
*************************************************/
5953
5954
/* This is called by several functions that scan a compiled expression looking
5955
for a fixed first character, or an anchoring opcode etc. It skips over things
5956
that do not influence this. For some calls, it makes sense to skip negative
5957
forward and all backward assertions, and also the \b assertion; for others it
5958
does not.
5959
5960
Arguments:
5961
  code         pointer to the start of the group
5962
  skipassert   TRUE if certain assertions are to be skipped
5963
5964
Returns:       pointer to the first significant opcode
5965
*/
5966
5967
static const PCRE2_UCHAR*
5968
first_significant_code(PCRE2_SPTR code, BOOL skipassert)
5969
39.5k
{
5970
39.5k
for (;;)
5971
41.5k
  {
5972
41.5k
  switch ((int)*code)
5973
41.5k
    {
5974
175
    case OP_ASSERT_NOT:
5975
592
    case OP_ASSERTBACK:
5976
2.11k
    case OP_ASSERTBACK_NOT:
5977
2.11k
    case OP_ASSERTBACK_NA:
5978
2.11k
    if (!skipassert) return code;
5979
1.06k
    do code += GET(code, 1); while (*code == OP_ALT);
5980
761
    code += PRIV(OP_lengths)[*code];
5981
761
    break;
5982
5983
112
    case OP_WORD_BOUNDARY:
5984
197
    case OP_NOT_WORD_BOUNDARY:
5985
197
    case OP_UCP_WORD_BOUNDARY:
5986
197
    case OP_NOT_UCP_WORD_BOUNDARY:
5987
197
    if (!skipassert) return code;
5988
120
    PCRE2_FALLTHROUGH /* Fall through */
5989
120
5990
227
    case OP_CALLOUT:
5991
229
    case OP_CREF:
5992
229
    case OP_DNCREF:
5993
553
    case OP_RREF:
5994
553
    case OP_DNRREF:
5995
553
    case OP_FALSE:
5996
553
    case OP_TRUE:
5997
553
    code += PRIV(OP_lengths)[*code];
5998
553
    break;
5999
6000
0
    case OP_CALLOUT_STR:
6001
0
    code += GET(code, 1 + 2*LINK_SIZE);
6002
0
    break;
6003
6004
45
    case OP_SKIPZERO:
6005
45
    code += 2 + GET(code, 2) + LINK_SIZE;
6006
45
    break;
6007
6008
1.33k
    case OP_COND:
6009
1.42k
    case OP_SCOND:
6010
1.42k
    if (code[1+LINK_SIZE] != OP_FALSE ||   /* Not DEFINE */
6011
78
        code[GET(code, 1)] != OP_KET)      /* More than one branch */
6012
1.34k
      return code;
6013
78
    code += GET(code, 1) + 1 + LINK_SIZE;
6014
78
    break;
6015
6016
162
    case OP_MARK:
6017
162
    case OP_COMMIT_ARG:
6018
311
    case OP_PRUNE_ARG:
6019
404
    case OP_SKIP_ARG:
6020
495
    case OP_THEN_ARG:
6021
495
    code += code[1] + PRIV(OP_lengths)[*code];
6022
495
    break;
6023
6024
36.8k
    default:
6025
36.8k
    return code;
6026
41.5k
    }
6027
41.5k
  }
6028
6029
/* LCOV_EXCL_START */
6030
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
6031
/* LCOV_EXCL_STOP */
6032
0
}
6033
6034
6035
6036
/*************************************************
6037
*           Compile one branch                   *
6038
*************************************************/
6039
6040
/* Scan the parsed pattern, compiling it into a vector of PCRE2_UCHAR. If
6041
the options are changed during the branch, the pointer is used to change the
6042
external options bits. This function is used during the pre-compile phase when
6043
we are trying to find out the amount of memory needed, as well as during the
6044
real compile phase. The value of lengthptr distinguishes the two phases.
6045
6046
Arguments:
6047
  optionsptr        pointer to the option bits
6048
  xoptionsptr       pointer to the extra option bits
6049
  codeptr           points to the pointer to the current code point
6050
  pptrptr           points to the current parsed pattern pointer
6051
  errorcodeptr      points to error code variable
6052
  firstcuptr        place to put the first required code unit
6053
  firstcuflagsptr   place to put the first code unit flags
6054
  reqcuptr          place to put the last required code unit
6055
  reqcuflagsptr     place to put the last required code unit flags
6056
  bcptr             points to current branch chain
6057
  open_caps         points to current capitem
6058
  cb                contains pointers to tables etc.
6059
  lengthptr         NULL during the real compile phase
6060
                    points to length accumulator during pre-compile phase
6061
6062
Returns:            0 There's been an error, *errorcodeptr is non-zero
6063
                   +1 Success, this branch must match at least one character
6064
                   -1 Success, this branch may match an empty string
6065
*/
6066
6067
static int
6068
compile_branch(uint32_t *optionsptr, uint32_t *xoptionsptr,
6069
  PCRE2_UCHAR **codeptr, uint32_t **pptrptr, int *errorcodeptr,
6070
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
6071
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
6072
  compile_block *cb, PCRE2_SIZE *lengthptr)
6073
101k
{
6074
101k
int bravalue = 0;
6075
101k
int okreturn = -1;
6076
101k
int group_return = 0;
6077
101k
uint32_t repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
6078
101k
uint32_t greedy_default, greedy_non_default;
6079
101k
uint32_t repeat_type, op_type;
6080
101k
uint32_t options = *optionsptr;               /* May change dynamically */
6081
101k
uint32_t xoptions = *xoptionsptr;             /* May change dynamically */
6082
101k
uint32_t firstcu, reqcu;
6083
101k
uint32_t zeroreqcu, zerofirstcu;
6084
101k
uint32_t *pptr = *pptrptr;
6085
101k
uint32_t meta, meta_arg;
6086
101k
uint32_t firstcuflags, reqcuflags;
6087
101k
uint32_t zeroreqcuflags, zerofirstcuflags;
6088
101k
uint32_t req_caseopt, reqvary, tempreqvary;
6089
101k
uint32_t j;
6090
101k
int i;
6091
/* Some opcodes, such as META_CAPTURE_NUMBER or META_CAPTURE_NAME,
6092
depends on the previous value of offset. */
6093
101k
PCRE2_SIZE offset = 0;
6094
101k
PCRE2_SIZE length_prevgroup = 0;
6095
101k
PCRE2_UCHAR *code = *codeptr;
6096
101k
PCRE2_UCHAR *last_code = code;
6097
101k
PCRE2_UCHAR *orig_code = code;
6098
101k
PCRE2_UCHAR *tempcode;
6099
101k
PCRE2_UCHAR *previous = NULL;
6100
101k
PCRE2_UCHAR op_previous;
6101
101k
BOOL groupsetfirstcu = FALSE;
6102
101k
BOOL had_accept = FALSE;
6103
101k
BOOL matched_char = FALSE;
6104
101k
BOOL previous_matched_char = FALSE;
6105
101k
BOOL reset_caseful = FALSE;
6106
6107
/* We can fish out the UTF setting once and for all into a BOOL, but we must
6108
not do this for other options (e.g. PCRE2_EXTENDED) that may change dynamically
6109
as we process the pattern. */
6110
6111
101k
#ifdef SUPPORT_UNICODE
6112
101k
BOOL utf = (options & PCRE2_UTF) != 0;
6113
101k
BOOL ucp = (options & PCRE2_UCP) != 0;
6114
#else  /* No Unicode support */
6115
BOOL utf = FALSE;
6116
#endif
6117
6118
/* Set up the default and non-default settings for greediness */
6119
6120
101k
greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6121
101k
greedy_non_default = greedy_default ^ 1;
6122
6123
/* Initialize no first unit, no required unit. REQ_UNSET means "no char
6124
matching encountered yet". It gets changed to REQ_NONE if we hit something that
6125
matches a non-fixed first unit; reqcu just remains unset if we never find one.
6126
6127
When we hit a repeat whose minimum is zero, we may have to adjust these values
6128
to take the zero repeat into account. This is implemented by setting them to
6129
zerofirstcu and zeroreqcu when such a repeat is encountered. The individual
6130
item types that can be repeated set these backoff variables appropriately. */
6131
6132
101k
firstcu = reqcu = zerofirstcu = zeroreqcu = 0;
6133
101k
firstcuflags = reqcuflags = zerofirstcuflags = zeroreqcuflags = REQ_UNSET;
6134
6135
/* The variable req_caseopt contains either the REQ_CASELESS bit or zero,
6136
according to the current setting of the caseless flag. The REQ_CASELESS value
6137
leaves the lower 28 bits empty. It is added into the firstcu or reqcu variables
6138
to record the case status of the value. This is used only for ASCII characters.
6139
*/
6140
6141
101k
req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6142
6143
/* Switch on next META item until the end of the branch */
6144
6145
259k
for (;; pptr++)
6146
361k
  {
6147
361k
  BOOL possessive_quantifier;
6148
361k
  BOOL note_group_empty;
6149
361k
  uint32_t mclength;
6150
361k
  uint32_t skipunits;
6151
361k
  uint32_t subreqcu, subfirstcu;
6152
361k
  uint32_t groupnumber;
6153
361k
  uint32_t verbarglen, verbculen;
6154
361k
  uint32_t subreqcuflags, subfirstcuflags;
6155
361k
  open_capitem *oc;
6156
361k
  PCRE2_UCHAR mcbuffer[8];
6157
6158
  /* Get next META item in the pattern and its potential argument. */
6159
6160
361k
  meta = META_CODE(*pptr);
6161
361k
  meta_arg = META_DATA(*pptr);
6162
6163
  /* If we are in the pre-compile phase, accumulate the length used for the
6164
  previous cycle of this loop, unless the next item is a quantifier. */
6165
6166
361k
  if (lengthptr != NULL)
6167
191k
    {
6168
    /* LCOV_EXCL_START */
6169
191k
    if (code >= cb->start_workspace + cb->workspace_size)
6170
0
      {
6171
0
      PCRE2_DEBUG_UNREACHABLE();
6172
0
      *errorcodeptr = ERR52;  /* Over-ran workspace - internal error */
6173
0
      cb->erroroffset = 0;
6174
0
      return 0;
6175
0
      }
6176
    /* LCOV_EXCL_STOP */
6177
6178
191k
    if (code > cb->start_workspace + cb->workspace_size -
6179
191k
        WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
6180
0
      {
6181
0
      *errorcodeptr = ERR86;  /* Pattern too complicated */
6182
0
      cb->erroroffset = 0;
6183
0
      return 0;
6184
0
      }
6185
6186
    /* There is at least one situation where code goes backwards: this is the
6187
    case of a zero quantifier after a class (e.g. [ab]{0}). When the quantifier
6188
    is processed, the whole class is eliminated. However, it is created first,
6189
    so we have to allow memory for it. Therefore, don't ever reduce the length
6190
    at this point. */
6191
6192
191k
    if (code < last_code) code = last_code;
6193
6194
    /* If the next thing is not a quantifier, we add the length of the previous
6195
    item into the total, and reset the code pointer to the start of the
6196
    workspace. Otherwise leave the previous item available to be quantified. */
6197
6198
191k
    if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6199
172k
      {
6200
172k
      if (OFLOW_MAX - *lengthptr < (PCRE2_SIZE)(code - orig_code))
6201
0
        {
6202
0
        *errorcodeptr = ERR20;   /* Integer overflow */
6203
0
        cb->erroroffset = 0;
6204
0
        return 0;
6205
0
        }
6206
172k
      *lengthptr += (PCRE2_SIZE)(code - orig_code);
6207
172k
      if (*lengthptr > MAX_PATTERN_SIZE)
6208
179
        {
6209
179
        *errorcodeptr = ERR20;   /* Pattern is too large */
6210
179
        cb->erroroffset = 0;
6211
179
        return 0;
6212
179
        }
6213
172k
      code = orig_code;
6214
172k
      }
6215
6216
    /* Remember where this code item starts so we can catch the "backwards"
6217
    case above next time round. */
6218
6219
191k
    last_code = code;
6220
191k
    }
6221
6222
  /* Process the next parsed pattern item. If it is not a quantifier, remember
6223
  where it starts so that it can be quantified when a quantifier follows.
6224
  Checking for the legality of quantifiers happens in parse_regex(), except for
6225
  a quantifier after an assertion that is a condition. */
6226
6227
360k
  if (meta < META_ASTERISK || meta > META_MINMAX_QUERY)
6228
323k
    {
6229
323k
    previous = code;
6230
323k
    if (matched_char && !had_accept) okreturn = 1;
6231
323k
    }
6232
6233
360k
  previous_matched_char = matched_char;
6234
360k
  matched_char = FALSE;
6235
360k
  note_group_empty = FALSE;
6236
360k
  skipunits = 0;         /* Default value for most subgroups */
6237
6238
360k
  switch(meta)
6239
360k
    {
6240
    /* ===================================================================*/
6241
    /* The branch terminates at pattern end or | or ) */
6242
6243
12.4k
    case META_END:
6244
33.7k
    case META_ALT:
6245
100k
    case META_KET:
6246
100k
    *firstcuptr = firstcu;
6247
100k
    *firstcuflagsptr = firstcuflags;
6248
100k
    *reqcuptr = reqcu;
6249
100k
    *reqcuflagsptr = reqcuflags;
6250
100k
    *codeptr = code;
6251
100k
    *pptrptr = pptr;
6252
100k
    return okreturn;
6253
6254
6255
    /* ===================================================================*/
6256
    /* Handle single-character metacharacters. In multiline mode, ^ disables
6257
    the setting of any following char as a first character. */
6258
6259
3.91k
    case META_CIRCUMFLEX:
6260
3.91k
    if ((options & PCRE2_MULTILINE) != 0)
6261
820
      {
6262
820
      if (firstcuflags == REQ_UNSET)
6263
587
        zerofirstcuflags = firstcuflags = REQ_NONE;
6264
820
      *code++ = OP_CIRCM;
6265
820
      }
6266
3.09k
    else *code++ = OP_CIRC;
6267
3.91k
    break;
6268
6269
710
    case META_DOLLAR:
6270
710
    *code++ = ((options & PCRE2_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
6271
710
    break;
6272
6273
    /* There can never be a first char if '.' is first, whatever happens about
6274
    repeats. The value of reqcu doesn't change either. */
6275
6276
7.34k
    case META_DOT:
6277
7.34k
    matched_char = TRUE;
6278
7.34k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6279
7.34k
    zerofirstcu = firstcu;
6280
7.34k
    zerofirstcuflags = firstcuflags;
6281
7.34k
    zeroreqcu = reqcu;
6282
7.34k
    zeroreqcuflags = reqcuflags;
6283
7.34k
    *code++ = ((options & PCRE2_DOTALL) != 0)? OP_ALLANY: OP_ANY;
6284
7.34k
    break;
6285
6286
6287
    /* ===================================================================*/
6288
    /* Empty character classes are allowed if PCRE2_ALLOW_EMPTY_CLASS is set.
6289
    Otherwise, an initial ']' is taken as a data character. When empty classes
6290
    are allowed, [] must generate an empty class - we have no dedicated opcode
6291
    to optimise the representation, but it's a rare case (the '(*FAIL)'
6292
    construct would be a clearer way for a pattern author to represent a
6293
    non-matching branch, but it does have different semantics to '[]' if both
6294
    are followed by a quantifier). The empty-negated [^] matches any character,
6295
    so is useful: generate OP_ALLANY for this. */
6296
6297
0
    case META_CLASS_EMPTY:
6298
0
    case META_CLASS_EMPTY_NOT:
6299
0
    matched_char = TRUE;
6300
0
    if (meta == META_CLASS_EMPTY_NOT) *code++ = OP_ALLANY;
6301
0
    else
6302
0
      {
6303
0
      *code++ = OP_CLASS;
6304
0
      memset(code, 0, 32);
6305
0
      code += 32 / sizeof(PCRE2_UCHAR);
6306
0
      }
6307
6308
0
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6309
0
    zerofirstcu = firstcu;
6310
0
    zerofirstcuflags = firstcuflags;
6311
0
    break;
6312
6313
6314
    /* ===================================================================*/
6315
    /* Non-empty character class. If the included characters are all < 256, we
6316
    build a 32-byte bitmap of the permitted characters, except in the special
6317
    case where there is only one such character. For negated classes, we build
6318
    the map as usual, then invert it at the end. However, we use a different
6319
    opcode so that data characters > 255 can be handled correctly.
6320
6321
    If the class contains characters outside the 0-255 range, a different
6322
    opcode is compiled. It may optionally have a bit map for characters < 256,
6323
    but those above are explicitly listed afterwards. A flag code unit tells
6324
    whether the bitmap is present, and whether this is a negated class or
6325
    not. */
6326
6327
2.14k
    case META_CLASS_NOT:
6328
8.14k
    case META_CLASS:
6329
8.14k
    matched_char = TRUE;
6330
6331
    /* Check for complex extended classes and handle them separately. */
6332
6333
8.14k
    if ((*pptr & CLASS_IS_ECLASS) != 0)
6334
0
      {
6335
0
      if (!PRIV(compile_class_nested)(options, xoptions, &pptr, &code,
6336
0
                                      errorcodeptr, cb, lengthptr))
6337
0
        return 0;
6338
0
      goto CLASS_END_PROCESSING;
6339
0
      }
6340
6341
    /* We can optimize the case of a single character in a class by generating
6342
    OP_CHAR or OP_CHARI if it's positive, or OP_NOT or OP_NOTI if it's
6343
    negative. In the negative case there can be no first char if this item is
6344
    first, whatever repeat count may follow. In the case of reqcu, save the
6345
    previous value for reinstating. */
6346
6347
    /* NOTE: at present this optimization is not effective if the only
6348
    character in a class in 32-bit, non-UCP mode has its top bit set. */
6349
6350
8.14k
    if (pptr[1] < META_END && pptr[2] == META_CLASS_END)
6351
3.01k
      {
6352
3.01k
      uint32_t c = pptr[1];
6353
6354
3.01k
      pptr += 2;                 /* Move on to class end */
6355
3.01k
      if (meta == META_CLASS)    /* A positive one-char class can be */
6356
1.08k
        {                        /* handled as a normal literal character. */
6357
1.08k
        meta = c;                /* Set up the character */
6358
1.08k
        goto NORMAL_CHAR_SET;
6359
1.08k
        }
6360
6361
      /* Handle a negative one-character class */
6362
6363
1.93k
      zeroreqcu = reqcu;
6364
1.93k
      zeroreqcuflags = reqcuflags;
6365
1.93k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6366
1.93k
      zerofirstcu = firstcu;
6367
1.93k
      zerofirstcuflags = firstcuflags;
6368
6369
      /* For caseless UTF or UCP mode, check whether this character has more
6370
      than one other case. If so, generate a special OP_NOTPROP item instead of
6371
      OP_NOTI. When restricted by PCRE2_EXTRA_CASELESS_RESTRICT, ignore any
6372
      caseless set that starts with an ASCII character. If the character is
6373
      affected by the special Turkish rules, hardcode the not-matching
6374
      characters using a caseset. */
6375
6376
1.93k
#ifdef SUPPORT_UNICODE
6377
1.93k
      if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
6378
0
        {
6379
0
        uint32_t caseset;
6380
6381
0
        if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6382
0
              PCRE2_EXTRA_TURKISH_CASING &&
6383
0
            UCD_ANY_I(c))
6384
0
          {
6385
0
          caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(c)? 0 : 3);
6386
0
          }
6387
0
        else if ((caseset = UCD_CASESET(c)) != 0 &&
6388
0
                 (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6389
0
                 PRIV(ucd_caseless_sets)[caseset] < 128)
6390
0
          {
6391
0
          caseset = 0;  /* Ignore the caseless set if it's restricted. */
6392
0
          }
6393
6394
0
        if (caseset != 0)
6395
0
          {
6396
0
          *code++ = OP_NOTPROP;
6397
0
          *code++ = PT_CLIST;
6398
0
          *code++ = caseset;
6399
0
          break;   /* We are finished with this class */
6400
0
          }
6401
0
        }
6402
1.93k
#endif
6403
      /* Char has only one other (usable) case, or UCP not available */
6404
6405
1.93k
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_NOTI: OP_NOT;
6406
1.93k
      code += PUTCHAR(c, code);
6407
1.93k
      break;   /* We are finished with this class */
6408
1.93k
      }        /* End of 1-char optimization */
6409
6410
    /* Handle character classes that contain more than just one literal
6411
    character. If there are exactly two characters in a positive class, see if
6412
    they are case partners. This can be optimized to generate a caseless single
6413
    character match (which also sets first/required code units if relevant).
6414
    When casing restrictions apply, ignore a caseless set if both characters
6415
    are ASCII. When Turkish casing applies, an 'i' does not match its normal
6416
    Unicode "othercase". */
6417
6418
5.12k
    if (meta == META_CLASS && pptr[1] < META_END && pptr[2] < META_END &&
6419
2.61k
        pptr[3] == META_CLASS_END)
6420
1.64k
      {
6421
1.64k
      uint32_t c = pptr[1];
6422
6423
1.64k
#ifdef SUPPORT_UNICODE
6424
1.64k
      if ((UCD_CASESET(c) == 0 ||
6425
18
           ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
6426
0
            c < 128 && pptr[2] < 128)) &&
6427
1.62k
          !((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
6428
1.62k
              PCRE2_EXTRA_TURKISH_CASING &&
6429
0
            UCD_ANY_I(c)))
6430
1.62k
#endif
6431
1.62k
        {
6432
1.62k
        uint32_t d;
6433
6434
1.62k
#ifdef SUPPORT_UNICODE
6435
1.62k
        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else
6436
1.62k
#endif
6437
1.62k
          {
6438
#if PCRE2_CODE_UNIT_WIDTH != 8
6439
          if (c > 255) d = c; else
6440
#endif
6441
1.62k
          d = TABLE_GET(c, cb->fcc, c);
6442
1.62k
          }
6443
6444
1.62k
        if (c != d && pptr[2] == d)
6445
2
          {
6446
2
          pptr += 3;                 /* Move on to class end */
6447
2
          meta = c;
6448
2
          if ((options & PCRE2_CASELESS) == 0)
6449
2
            {
6450
2
            reset_caseful = TRUE;
6451
2
            options |= PCRE2_CASELESS;
6452
2
            req_caseopt = REQ_CASELESS;
6453
2
            }
6454
2
          goto CLASS_CASELESS_CHAR;
6455
2
          }
6456
1.62k
        }
6457
1.64k
      }
6458
6459
    /* Now emit the OP_CLASS/OP_NCLASS/OP_XCLASS/OP_ALLANY opcode. */
6460
6461
5.12k
    pptr = PRIV(compile_class_not_nested)(options, xoptions, pptr + 1,
6462
5.12k
                                          &code, meta == META_CLASS_NOT, NULL,
6463
5.12k
                                          errorcodeptr, cb, lengthptr);
6464
5.12k
    if (pptr == NULL) return 0;
6465
5.12k
    PCRE2_ASSERT(*pptr == META_CLASS_END);
6466
6467
5.12k
    CLASS_END_PROCESSING:
6468
6469
    /* If this class is the first thing in the branch, there can be no first
6470
    char setting, whatever the repeat count. Any reqcu setting must remain
6471
    unchanged after any kind of repeat. */
6472
6473
5.12k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6474
5.12k
    zerofirstcu = firstcu;
6475
5.12k
    zerofirstcuflags = firstcuflags;
6476
5.12k
    zeroreqcu = reqcu;
6477
5.12k
    zeroreqcuflags = reqcuflags;
6478
5.12k
    break;  /* End of class processing */
6479
6480
6481
    /* ===================================================================*/
6482
    /* Deal with (*VERB)s. */
6483
6484
    /* Check for open captures before ACCEPT and close those that are within
6485
    the same assertion level, also converting ACCEPT to ASSERT_ACCEPT in an
6486
    assertion. In the first pass, just accumulate the length required;
6487
    otherwise hitting (*ACCEPT) inside many nested parentheses can cause
6488
    workspace overflow. Do not set firstcu after *ACCEPT. */
6489
6490
380
    case META_ACCEPT:
6491
380
    cb->had_accept = had_accept = TRUE;
6492
380
    for (oc = open_caps;
6493
1.46k
         oc != NULL && oc->assert_depth >= cb->assert_depth;
6494
1.08k
         oc = oc->next)
6495
1.08k
      {
6496
1.08k
      if (lengthptr != NULL)
6497
542
        {
6498
542
        *lengthptr += CU2BYTES(1) + IMM2_SIZE;
6499
542
        }
6500
542
      else
6501
542
        {
6502
542
        *code++ = OP_CLOSE;
6503
542
        PUT2INC(code, 0, oc->number);
6504
542
        }
6505
1.08k
      }
6506
380
    *code++ = (cb->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
6507
380
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
6508
380
    break;
6509
6510
179
    case META_PRUNE:
6511
389
    case META_SKIP:
6512
389
    cb->had_pruneorskip = TRUE;
6513
389
    PCRE2_FALLTHROUGH /* Fall through */
6514
405
    case META_COMMIT:
6515
479
    case META_FAIL:
6516
479
    *code++ = verbops[(meta - META_MARK) >> 16];
6517
479
    break;
6518
6519
140
    case META_THEN:
6520
140
    cb->external_flags |= PCRE2_HASTHEN;
6521
140
    *code++ = OP_THEN;
6522
140
    break;
6523
6524
    /* Handle verbs with arguments. Arguments can be very long, especially in
6525
    16- and 32-bit modes, and can overflow the workspace in the first pass.
6526
    However, the argument length is constrained to be small enough to fit in
6527
    one code unit. This check happens in parse_regex(). In the first pass,
6528
    instead of putting the argument into memory, we just update the length
6529
    counter and set up an empty argument. */
6530
6531
248
    case META_THEN_ARG:
6532
248
    cb->external_flags |= PCRE2_HASTHEN;
6533
248
    goto VERB_ARG;
6534
6535
249
    case META_PRUNE_ARG:
6536
504
    case META_SKIP_ARG:
6537
504
    cb->had_pruneorskip = TRUE;
6538
504
    PCRE2_FALLTHROUGH /* Fall through */
6539
1.09k
    case META_MARK:
6540
1.09k
    case META_COMMIT_ARG:
6541
1.34k
    VERB_ARG:
6542
1.34k
    *code++ = verbops[(meta - META_MARK) >> 16];
6543
    /* The length is in characters. */
6544
1.34k
    verbarglen = *(++pptr);
6545
1.34k
    verbculen = 0;
6546
1.34k
    tempcode = code++;
6547
3.16k
    for (i = 0; i < (int)verbarglen; i++)
6548
1.81k
      {
6549
1.81k
      meta = *(++pptr);
6550
1.81k
#ifdef SUPPORT_UNICODE
6551
1.81k
      if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
6552
1.81k
#endif
6553
1.81k
        {
6554
1.81k
        mclength = 1;
6555
1.81k
        mcbuffer[0] = meta;
6556
1.81k
        }
6557
1.81k
      if (lengthptr != NULL) *lengthptr += mclength; else
6558
892
        {
6559
892
        memcpy(code, mcbuffer, CU2BYTES(mclength));
6560
892
        code += mclength;
6561
892
        verbculen += mclength;
6562
892
        }
6563
1.81k
      }
6564
6565
1.34k
    *tempcode = verbculen;   /* Fill in the code unit length */
6566
1.34k
    *code++ = 0;             /* Terminating zero */
6567
1.34k
    break;
6568
6569
6570
    /* ===================================================================*/
6571
    /* Handle options change. The new setting must be passed back for use in
6572
    subsequent branches. Reset the greedy defaults and the case value for
6573
    firstcu and reqcu. */
6574
6575
1.63k
    case META_OPTIONS:
6576
1.63k
    *optionsptr = options = *(++pptr);
6577
1.63k
    *xoptionsptr = xoptions = *(++pptr);
6578
1.63k
    greedy_default = ((options & PCRE2_UNGREEDY) != 0);
6579
1.63k
    greedy_non_default = greedy_default ^ 1;
6580
1.63k
    req_caseopt = ((options & PCRE2_CASELESS) != 0)? REQ_CASELESS : 0;
6581
1.63k
    break;
6582
6583
    /* ===================================================================*/
6584
    /* Handle scan substring. Scan substring assertion starts with META_SCS,
6585
    which recursively calls compile_branch. The first opcode processed by
6586
    this recursive call is always META_OFFSET. */
6587
6588
0
    case META_OFFSET:
6589
0
    if (lengthptr != NULL)
6590
0
      {
6591
0
      pptr = PRIV(compile_parse_scan_substr_args)(pptr, errorcodeptr, cb, lengthptr);
6592
0
      if (pptr == NULL)
6593
0
        return 0;
6594
0
      break;
6595
0
      }
6596
6597
0
    while (TRUE)
6598
0
      {
6599
0
      int count, index;
6600
0
      named_group *ng;
6601
6602
0
      switch (META_CODE(*pptr))
6603
0
        {
6604
0
        case META_OFFSET:
6605
0
        pptr++;
6606
0
        SKIPOFFSET(pptr);
6607
0
        continue;
6608
6609
0
        case META_CAPTURE_NAME:
6610
0
        ng = cb->named_groups + pptr[1];
6611
0
        pptr += 2;
6612
0
        count = 0;
6613
0
        index = 0;
6614
6615
0
        if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6616
0
          &count, errorcodeptr, cb)) return 0;
6617
6618
0
        code[0] = OP_DNCREF;
6619
0
        PUT2(code, 1, index);
6620
0
        PUT2(code, 1 + IMM2_SIZE, count);
6621
0
        code += 1 + 2 * IMM2_SIZE;
6622
0
        continue;
6623
6624
0
        case META_CAPTURE_NUMBER:
6625
0
        pptr += 2;
6626
0
        if (pptr[-1] == 0) continue;
6627
6628
0
        code[0] = OP_CREF;
6629
0
        PUT2(code, 1, pptr[-1]);
6630
0
        code += 1 + IMM2_SIZE;
6631
0
        continue;
6632
6633
0
        default:
6634
0
        break;
6635
0
        }
6636
6637
0
      break;
6638
0
      }
6639
0
    --pptr;
6640
0
    break;
6641
6642
0
    case META_SCS:
6643
0
    bravalue = OP_ASSERT_SCS;
6644
0
    cb->assert_depth += 1;
6645
0
    goto GROUP_PROCESS;
6646
6647
6648
    /* ===================================================================*/
6649
    /* Handle conditional subpatterns. The case of (?(Rdigits) is ambiguous
6650
    because it could be a numerical check on recursion, or a name check on a
6651
    group's being set. The pre-pass sets up META_COND_RNUMBER as a name so that
6652
    we can handle it either way. We first try for a name; if not found, process
6653
    the number. */
6654
6655
1.98k
    case META_COND_RNUMBER:   /* (?(Rdigits) */
6656
2.75k
    case META_COND_NAME:      /* (?(name) or (?'name') or ?(<name>) */
6657
2.98k
    case META_COND_RNAME:     /* (?(R&name) - test for recursion */
6658
2.98k
    bravalue = OP_COND;
6659
6660
2.98k
    if (lengthptr != NULL)
6661
1.60k
      {
6662
1.60k
      uint32_t i;
6663
1.60k
      PCRE2_SPTR name;
6664
1.60k
      named_group *ng;
6665
1.60k
      uint32_t *start_pptr = pptr;
6666
1.60k
      uint32_t length = *(++pptr);
6667
6668
1.60k
      GETPLUSOFFSET(offset, pptr);
6669
1.60k
      name = cb->start_pattern + offset;
6670
6671
      /* In the first pass, the names generated in the pre-pass are available,
6672
      but the main name table has not yet been created. Scan the list of names
6673
      generated in the pre-pass in order to get a number and whether or not
6674
      this name is duplicated. If it is not duplicated, we can handle it as a
6675
      numerical group. */
6676
6677
1.60k
      ng = PRIV(compile_find_named_group)(name, length, cb);
6678
6679
1.60k
      if (ng == NULL)
6680
1.18k
        {
6681
        /* If the name was not found we have a bad reference, unless we are
6682
        dealing with R<digits>, which is treated as a recursion test by
6683
        number. */
6684
6685
1.18k
        groupnumber = 0;
6686
1.18k
        if (meta == META_COND_RNUMBER)
6687
1.02k
          {
6688
1.28k
          for (i = 1; i < length; i++)
6689
297
            {
6690
297
            groupnumber = groupnumber * 10 + (name[i] - CHAR_0);
6691
297
            if (groupnumber > MAX_GROUP_NUMBER)
6692
35
              {
6693
35
              *errorcodeptr = ERR61;
6694
35
              cb->erroroffset = offset + i;
6695
35
              return 0;
6696
35
              }
6697
297
            }
6698
1.02k
          }
6699
6700
1.15k
        if (meta != META_COND_RNUMBER || groupnumber > cb->bracount)
6701
193
          {
6702
193
          *errorcodeptr = ERR15;
6703
193
          cb->erroroffset = offset;
6704
193
          return 0;
6705
193
          }
6706
6707
        /* (?Rdigits) treated as a recursion reference by number. A value of
6708
        zero (which is the result of both (?R) and (?R0)) means "any", and is
6709
        translated into RREF_ANY (which is 0xffff). */
6710
6711
959
        if (groupnumber == 0) groupnumber = RREF_ANY;
6712
959
        PCRE2_ASSERT(start_pptr[0] == META_COND_RNUMBER);
6713
959
        start_pptr[1] = groupnumber;
6714
959
        skipunits = 1+IMM2_SIZE;
6715
959
        goto GROUP_PROCESS_NOTE_EMPTY;
6716
1.15k
        }
6717
6718
      /* From here on, we know we have a name (not a number),
6719
      so treat META_COND_RNUMBER the same as META_COND_NAME. */
6720
421
      if (meta == META_COND_RNUMBER) meta = META_COND_NAME;
6721
6722
421
      if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
6723
296
        {
6724
        /* Found a non-duplicated name. Since it is a global,
6725
        it is enough to update it in the pre-processing phase. */
6726
296
        if (ng->number > cb->top_backref) cb->top_backref = ng->number;
6727
6728
296
        start_pptr[0] = meta;
6729
296
        start_pptr[1] = ng->number;
6730
6731
296
        skipunits = 1 + IMM2_SIZE;
6732
296
        goto GROUP_PROCESS_NOTE_EMPTY;
6733
296
        }
6734
6735
      /* We have a duplicated name. In the compile pass we have to search the
6736
      main table in order to get the index and count values. */
6737
6738
125
      start_pptr[0] = meta | 1;
6739
125
      start_pptr[1] = (uint32_t)(ng - cb->named_groups);
6740
6741
      /* A duplicated name was found. Note that if an R<digits> name is found
6742
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6743
125
      skipunits = 1 + 2 * IMM2_SIZE;
6744
125
      }
6745
1.37k
    else
6746
1.37k
      {
6747
      /* Otherwise lengthptr equals NULL,
6748
      which is the second phase of compilation. */
6749
1.37k
      int count, index;
6750
1.37k
      named_group *ng;
6751
6752
      /* Generate code using the data
6753
      collected in the pre-processing phase. */
6754
6755
1.37k
      if (meta == META_COND_RNUMBER)
6756
958
        {
6757
958
        code[1+LINK_SIZE] = OP_RREF;
6758
958
        PUT2(code, 2 + LINK_SIZE, pptr[1]);
6759
958
        skipunits = 1 + IMM2_SIZE;
6760
958
        pptr += 1 + SIZEOFFSET;
6761
958
        goto GROUP_PROCESS_NOTE_EMPTY;
6762
958
        }
6763
6764
415
      if (meta_arg == 0)
6765
290
        {
6766
290
        code[1+LINK_SIZE] = (meta == META_COND_RNAME)? OP_RREF : OP_CREF;
6767
290
        PUT2(code, 2 + LINK_SIZE, pptr[1]);
6768
290
        skipunits = 1 + IMM2_SIZE;
6769
290
        pptr += 1 + SIZEOFFSET;
6770
290
        goto GROUP_PROCESS_NOTE_EMPTY;
6771
290
        }
6772
6773
125
      ng = cb->named_groups + pptr[1];
6774
125
      count = 0;  /* Values for first pass (avoids compiler warning) */
6775
125
      index = 0;
6776
6777
      /* The failed case is an internal error. */
6778
125
      if (!PRIV(compile_find_dupname_details)(ng->name, ng->length, &index,
6779
125
            &count, errorcodeptr, cb)) return 0;
6780
6781
      /* A duplicated name was found. Note that if an R<digits> name is found
6782
      (META_COND_RNUMBER), it is a reference test, not a recursion test. */
6783
6784
125
      code[1 + LINK_SIZE] = (meta == META_COND_RNAME)? OP_DNRREF : OP_DNCREF;
6785
6786
      /* Insert appropriate data values. */
6787
125
      PUT2(code, 2 + LINK_SIZE, index);
6788
125
      PUT2(code, 2 + LINK_SIZE + IMM2_SIZE, count);
6789
125
      skipunits = 1 + 2 * IMM2_SIZE;
6790
125
      pptr += 1 + SIZEOFFSET;
6791
125
      }
6792
6793
250
    PCRE2_ASSERT(meta != META_CAPTURE_NAME);
6794
250
    goto GROUP_PROCESS_NOTE_EMPTY;
6795
6796
    /* The DEFINE condition is always false. Its internal groups may never
6797
    be called, so matched_char must remain false, hence the jump to
6798
    GROUP_PROCESS rather than GROUP_PROCESS_NOTE_EMPTY. */
6799
6800
508
    case META_COND_DEFINE:
6801
508
    bravalue = OP_COND;
6802
508
    GETPLUSOFFSET(offset, pptr);
6803
508
    code[1+LINK_SIZE] = OP_DEFINE;
6804
508
    skipunits = 1;
6805
508
    goto GROUP_PROCESS;
6806
6807
    /* Conditional test of a group's being set. */
6808
6809
860
    case META_COND_NUMBER:
6810
860
    bravalue = OP_COND;
6811
860
    GETPLUSOFFSET(offset, pptr);
6812
6813
860
    groupnumber = *(++pptr);
6814
860
    if (groupnumber > cb->bracount)
6815
25
      {
6816
25
      *errorcodeptr = ERR15;
6817
25
      cb->erroroffset = offset;
6818
25
      return 0;
6819
25
      }
6820
835
    if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6821
6822
    /* Point at initial ( for too many branches error */
6823
835
    offset -= 2;
6824
835
    code[1+LINK_SIZE] = OP_CREF;
6825
835
    skipunits = 1+IMM2_SIZE;
6826
835
    PUT2(code, 2+LINK_SIZE, groupnumber);
6827
835
    goto GROUP_PROCESS_NOTE_EMPTY;
6828
6829
    /* Test for the PCRE2 version. */
6830
6831
0
    case META_COND_VERSION:
6832
0
    bravalue = OP_COND;
6833
0
    if (pptr[1] > 0)
6834
0
      code[1+LINK_SIZE] = ((PCRE2_MAJOR > pptr[2]) ||
6835
0
        (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR >= pptr[3]))?
6836
0
          OP_TRUE : OP_FALSE;
6837
0
    else
6838
0
      code[1+LINK_SIZE] = (PCRE2_MAJOR == pptr[2] && PCRE2_MINOR == pptr[3])?
6839
0
        OP_TRUE : OP_FALSE;
6840
0
    skipunits = 1;
6841
0
    pptr += 3;
6842
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6843
6844
    /* The condition is an assertion, possibly preceded by a callout. */
6845
6846
1.63k
    case META_COND_ASSERT:
6847
1.63k
    bravalue = OP_COND;
6848
1.63k
    goto GROUP_PROCESS_NOTE_EMPTY;
6849
6850
6851
    /* ===================================================================*/
6852
    /* Handle all kinds of nested bracketed groups. The non-capturing,
6853
    non-conditional cases are here; others come to GROUP_PROCESS via goto. */
6854
6855
4.96k
    case META_LOOKAHEAD:
6856
4.96k
    bravalue = OP_ASSERT;
6857
4.96k
    cb->assert_depth += 1;
6858
4.96k
    goto GROUP_PROCESS;
6859
6860
0
    case META_LOOKAHEAD_NA:
6861
0
    bravalue = OP_ASSERT_NA;
6862
0
    cb->assert_depth += 1;
6863
0
    goto GROUP_PROCESS;
6864
6865
    /* Optimize (?!) to (*FAIL) unless it is quantified - which is a weird
6866
    thing to do, but Perl allows all assertions to be quantified, and when
6867
    they contain capturing parentheses there may be a potential use for
6868
    this feature. Not that that applies to a quantified (?!) but we allow
6869
    it for uniformity. */
6870
6871
2.77k
    case META_LOOKAHEADNOT:
6872
2.77k
    if (pptr[1] == META_KET &&
6873
2.11k
         (pptr[2] < META_ASTERISK || pptr[2] > META_MINMAX_QUERY))
6874
922
      {
6875
922
      *code++ = OP_FAIL;
6876
922
      pptr++;
6877
922
      }
6878
1.84k
    else
6879
1.84k
      {
6880
1.84k
      bravalue = OP_ASSERT_NOT;
6881
1.84k
      cb->assert_depth += 1;
6882
1.84k
      goto GROUP_PROCESS;
6883
1.84k
      }
6884
922
    break;
6885
6886
922
    case META_LOOKBEHIND:
6887
685
    bravalue = OP_ASSERTBACK;
6888
685
    cb->assert_depth += 1;
6889
685
    goto GROUP_PROCESS;
6890
6891
1.33k
    case META_LOOKBEHINDNOT:
6892
1.33k
    bravalue = OP_ASSERTBACK_NOT;
6893
1.33k
    cb->assert_depth += 1;
6894
1.33k
    goto GROUP_PROCESS;
6895
6896
0
    case META_LOOKBEHIND_NA:
6897
0
    bravalue = OP_ASSERTBACK_NA;
6898
0
    cb->assert_depth += 1;
6899
0
    goto GROUP_PROCESS;
6900
6901
2.74k
    case META_ATOMIC:
6902
2.74k
    bravalue = OP_ONCE;
6903
2.74k
    goto GROUP_PROCESS_NOTE_EMPTY;
6904
6905
0
    case META_SCRIPT_RUN:
6906
0
    bravalue = OP_SCRIPT_RUN;
6907
0
    goto GROUP_PROCESS_NOTE_EMPTY;
6908
6909
2.97k
    case META_NOCAPTURE:
6910
2.97k
    bravalue = OP_BRA;
6911
    /* Fall through */
6912
6913
    /* Process nested bracketed regex. The nesting depth is maintained for the
6914
    benefit of the stackguard function. The test for too deep nesting is now
6915
    done in parse_regex(). Assertion and DEFINE groups come to GROUP_PROCESS;
6916
    others come to GROUP_PROCESS_NOTE_EMPTY, to indicate that we need to take
6917
    note of whether or not they may match an empty string. */
6918
6919
57.5k
    GROUP_PROCESS_NOTE_EMPTY:
6920
57.5k
    note_group_empty = TRUE;
6921
6922
66.9k
    GROUP_PROCESS:
6923
66.9k
    cb->parens_depth += 1;
6924
66.9k
    *code = bravalue;
6925
66.9k
    pptr++;
6926
66.9k
    tempcode = code;
6927
66.9k
    tempreqvary = cb->req_varyopt;        /* Save value before group */
6928
66.9k
    length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6929
6930
66.9k
    if ((group_return =
6931
66.9k
         compile_regex(
6932
66.9k
         options,                         /* The options state */
6933
66.9k
         xoptions,                        /* The extra options state */
6934
66.9k
         &tempcode,                       /* Where to put code (updated) */
6935
66.9k
         &pptr,                           /* Input pointer (updated) */
6936
66.9k
         errorcodeptr,                    /* Where to put an error message */
6937
66.9k
         skipunits,                       /* Skip over bracket number */
6938
66.9k
         &subfirstcu,                     /* For possible first char */
6939
66.9k
         &subfirstcuflags,
6940
66.9k
         &subreqcu,                       /* For possible last char */
6941
66.9k
         &subreqcuflags,
6942
66.9k
         bcptr,                           /* Current branch chain */
6943
66.9k
         open_caps,                       /* Pointer to capture stack */
6944
66.9k
         cb,                              /* Compile data block */
6945
66.9k
         (lengthptr == NULL)? NULL :      /* Actual compile phase */
6946
66.9k
           &length_prevgroup              /* Pre-compile phase */
6947
66.9k
         )) == 0)
6948
397
      return 0;  /* Error */
6949
6950
66.5k
    cb->parens_depth -= 1;
6951
6952
    /* If that was a non-conditional significant group (not an assertion, not a
6953
    DEFINE) that matches at least one character, then the current item matches
6954
    a character. Conditionals are handled below. */
6955
6956
66.5k
    if (note_group_empty && bravalue != OP_COND && group_return > 0)
6957
13.1k
      matched_char = TRUE;
6958
6959
    /* If we've just compiled an assertion, pop the assert depth. */
6960
6961
66.5k
    if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)
6962
8.82k
      cb->assert_depth -= 1;
6963
6964
    /* At the end of compiling, code is still pointing to the start of the
6965
    group, while tempcode has been updated to point past the end of the group.
6966
    The parsed pattern pointer (pptr) is on the closing META_KET.
6967
6968
    If this is a conditional bracket, check that there are no more than
6969
    two branches in the group, or just one if it's a DEFINE group. We do this
6970
    in the real compile phase, not in the pre-pass, where the whole group may
6971
    not be available. */
6972
6973
66.5k
    if (bravalue == OP_COND && lengthptr == NULL)
6974
2.84k
      {
6975
2.84k
      PCRE2_UCHAR *tc = code;
6976
2.84k
      int condcount = 0;
6977
6978
3.51k
      do {
6979
3.51k
         condcount++;
6980
3.51k
         tc += GET(tc,1);
6981
3.51k
         }
6982
3.51k
      while (*tc != OP_KET);
6983
6984
      /* A DEFINE group is never obeyed inline (the "condition" is always
6985
      false). It must have only one branch. Having checked this, change the
6986
      opcode to OP_FALSE. */
6987
6988
2.84k
      if (code[LINK_SIZE+1] == OP_DEFINE)
6989
246
        {
6990
246
        if (condcount > 1)
6991
5
          {
6992
5
          cb->erroroffset = offset;
6993
5
          *errorcodeptr = ERR54;
6994
5
          return 0;
6995
5
          }
6996
241
        code[LINK_SIZE+1] = OP_FALSE;
6997
241
        bravalue = OP_DEFINE;   /* A flag to suppress char handling below */
6998
241
        }
6999
7000
      /* A "normal" conditional group. If there is just one branch, we must not
7001
      make use of its firstcu or reqcu, because this is equivalent to an
7002
      empty second branch. Also, it may match an empty string. If there are two
7003
      branches, this item must match a character if the group must. */
7004
7005
2.60k
      else
7006
2.60k
        {
7007
2.60k
        if (condcount > 2)
7008
9
          {
7009
9
          cb->erroroffset = offset;
7010
9
          *errorcodeptr = ERR27;
7011
9
          return 0;
7012
9
          }
7013
2.59k
        if (condcount == 1) subfirstcuflags = subreqcuflags = REQ_NONE;
7014
508
          else if (group_return > 0) matched_char = TRUE;
7015
2.59k
        }
7016
2.84k
      }
7017
7018
    /* In the pre-compile phase, update the length by the length of the group,
7019
    less the brackets at either end. Then reduce the compiled code to just a
7020
    set of non-capturing brackets so that it doesn't use much memory if it is
7021
    duplicated by a quantifier. */
7022
7023
66.4k
    if (lengthptr != NULL)
7024
33.7k
      {
7025
33.7k
      if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
7026
0
        {
7027
0
        *errorcodeptr = ERR20;
7028
0
        return 0;
7029
0
        }
7030
33.7k
      *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
7031
33.7k
      code++;   /* This already contains bravalue */
7032
33.7k
      PUTINC(code, 0, 1 + LINK_SIZE);
7033
33.7k
      *code++ = OP_KET;
7034
33.7k
      PUTINC(code, 0, 1 + LINK_SIZE);
7035
33.7k
      break;    /* No need to waste time with special character handling */
7036
33.7k
      }
7037
7038
    /* Otherwise update the main code pointer to the end of the group. */
7039
7040
32.7k
    code = tempcode;
7041
7042
    /* For a DEFINE group, required and first character settings are not
7043
    relevant. */
7044
7045
32.7k
    if (bravalue == OP_DEFINE) break;
7046
7047
    /* Handle updating of the required and first code units for other types of
7048
    group. Update for normal brackets of all kinds, and conditions with two
7049
    branches (see code above). If the bracket is followed by a quantifier with
7050
    zero repeat, we have to back off. Hence the definition of zeroreqcu and
7051
    zerofirstcu outside the main loop so that they can be accessed for the back
7052
    off. */
7053
7054
32.4k
    zeroreqcu = reqcu;
7055
32.4k
    zeroreqcuflags = reqcuflags;
7056
32.4k
    zerofirstcu = firstcu;
7057
32.4k
    zerofirstcuflags = firstcuflags;
7058
32.4k
    groupsetfirstcu = FALSE;
7059
7060
32.4k
    if (bravalue >= OP_ONCE)  /* Not an assertion */
7061
28.0k
      {
7062
      /* If we have not yet set a firstcu in this branch, take it from the
7063
      subpattern, remembering that it was set here so that a repeat of more
7064
      than one can replicate it as reqcu if necessary. If the subpattern has
7065
      no firstcu, set "none" for the whole branch. In both cases, a zero
7066
      repeat forces firstcu to "none". */
7067
7068
28.0k
      if (firstcuflags == REQ_UNSET && subfirstcuflags != REQ_UNSET)
7069
6.85k
        {
7070
6.85k
        if (subfirstcuflags < REQ_NONE)
7071
2.62k
          {
7072
2.62k
          firstcu = subfirstcu;
7073
2.62k
          firstcuflags = subfirstcuflags;
7074
2.62k
          groupsetfirstcu = TRUE;
7075
2.62k
          }
7076
4.22k
        else firstcuflags = REQ_NONE;
7077
6.85k
        zerofirstcuflags = REQ_NONE;
7078
6.85k
        }
7079
7080
      /* If firstcu was previously set, convert the subpattern's firstcu
7081
      into reqcu if there wasn't one, using the vary flag that was in
7082
      existence beforehand. */
7083
7084
21.2k
      else if (subfirstcuflags < REQ_NONE && subreqcuflags >= REQ_NONE)
7085
525
        {
7086
525
        subreqcu = subfirstcu;
7087
525
        subreqcuflags = subfirstcuflags | tempreqvary;
7088
525
        }
7089
7090
      /* If the subpattern set a required code unit (or set a first code unit
7091
      that isn't really the first code unit - see above), set it. */
7092
7093
28.0k
      if (subreqcuflags < REQ_NONE)
7094
4.57k
        {
7095
4.57k
        reqcu = subreqcu;
7096
4.57k
        reqcuflags = subreqcuflags;
7097
4.57k
        }
7098
28.0k
      }
7099
7100
    /* For a forward assertion, we take the reqcu, if set, provided that the
7101
    group has also set a firstcu. This can be helpful if the pattern that
7102
    follows the assertion doesn't set a different char. For example, it's
7103
    useful for /(?=abcde).+/. We can't set firstcu for an assertion, however
7104
    because it leads to incorrect results for patterns such as /(?=a)a.+/ when
7105
    the "real" "a" would then become a reqcu instead of a firstcu. This is
7106
    overcome by a scan at the end if there's no firstcu, looking for an
7107
    asserted first char. A similar effect for patterns like /(?=.*X)X$/ means
7108
    we must only take the reqcu when the group also set a firstcu. Otherwise,
7109
    in that example, 'X' ends up set for both. */
7110
7111
4.40k
    else if ((bravalue == OP_ASSERT || bravalue == OP_ASSERT_NA) &&
7112
2.47k
             subreqcuflags < REQ_NONE && subfirstcuflags < REQ_NONE)
7113
482
      {
7114
482
      reqcu = subreqcu;
7115
482
      reqcuflags = subreqcuflags;
7116
482
      }
7117
7118
32.4k
    break;  /* End of nested group handling */
7119
7120
7121
    /* ===================================================================*/
7122
    /* Handle named backreferences and recursions. */
7123
7124
2.49k
    case META_BACKREF_BYNAME:
7125
2.70k
    case META_RECURSE_BYNAME:
7126
2.70k
      {
7127
2.70k
      int count, index;
7128
2.70k
      PCRE2_SPTR name;
7129
2.70k
      named_group *ng;
7130
2.70k
      uint32_t length = *(++pptr);
7131
7132
2.70k
      GETPLUSOFFSET(offset, pptr);
7133
2.70k
      name = cb->start_pattern + offset;
7134
7135
      /* In the first pass, the names generated in the pre-pass are available,
7136
      but the main name table has not yet been created. Scan the list of names
7137
      generated in the pre-pass in order to get a number and whether or not
7138
      this name is duplicated. */
7139
7140
2.70k
      ng = PRIV(compile_find_named_group)(name, length, cb);
7141
7142
2.70k
      if (ng == NULL)
7143
160
        {
7144
        /* If the name was not found we have a bad reference. */
7145
160
        *errorcodeptr = ERR15;
7146
160
        cb->erroroffset = offset;
7147
160
        return 0;
7148
160
        }
7149
7150
2.54k
      groupnumber = ng->number;
7151
7152
      /* For a recursion, that's all that is needed. We can now go to
7153
      the code that handles numerical recursion, applying it to the first
7154
      group with the given name. */
7155
7156
2.54k
      if (meta == META_RECURSE_BYNAME)
7157
82
        {
7158
82
        meta_arg = groupnumber;
7159
82
        goto HANDLE_NUMERICAL_RECURSION;
7160
82
        }
7161
7162
      /* For a back reference, update the back reference map and the
7163
      maximum back reference. */
7164
7165
2.46k
      cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;
7166
2.46k
      if (groupnumber > cb->top_backref)
7167
139
        cb->top_backref = groupnumber;
7168
7169
      /* If a back reference name is not duplicated, we can handle it as
7170
      a numerical reference. */
7171
7172
2.46k
      if ((ng->hash_dup & NAMED_GROUP_IS_DUPNAME) == 0)
7173
399
        {
7174
399
        meta_arg = groupnumber;
7175
399
        goto HANDLE_SINGLE_REFERENCE;
7176
399
        }
7177
7178
      /* If a back reference name is duplicated, we generate a different
7179
      opcode to a numerical back reference. In the second pass we must
7180
      search for the index and count in the final name table. */
7181
7182
2.06k
      count = 0;  /* Values for first pass (avoids compiler warning) */
7183
2.06k
      index = 0;
7184
2.06k
      if (lengthptr == NULL && !PRIV(compile_find_dupname_details)(name, length,
7185
967
            &index, &count, errorcodeptr, cb)) return 0;
7186
7187
2.06k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
7188
2.06k
      *code++ = ((options & PCRE2_CASELESS) != 0)? OP_DNREFI : OP_DNREF;
7189
2.06k
      PUT2INC(code, 0, index);
7190
2.06k
      PUT2INC(code, 0, count);
7191
2.06k
      if ((options & PCRE2_CASELESS) != 0)
7192
328
        *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
7193
328
                   REFI_FLAG_CASELESS_RESTRICT : 0) |
7194
328
                  (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
7195
328
                   REFI_FLAG_TURKISH_CASING : 0);
7196
2.06k
      }
7197
0
    break;
7198
7199
7200
    /* ===================================================================*/
7201
    /* Handle a numerical callout. */
7202
7203
1.45k
    case META_CALLOUT_NUMBER:
7204
1.45k
    code[0] = OP_CALLOUT;
7205
1.45k
    PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7206
1.45k
    PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7207
1.45k
    code[1 + 2*LINK_SIZE] = pptr[3];
7208
1.45k
    pptr += 3;
7209
1.45k
    code += PRIV(OP_lengths)[OP_CALLOUT];
7210
1.45k
    break;
7211
7212
7213
    /* ===================================================================*/
7214
    /* Handle a callout with a string argument. In the pre-pass we just compute
7215
    the length without generating anything. The length in pptr[3] includes both
7216
    delimiters; in the actual compile only the first one is copied, but a
7217
    terminating zero is added. Any doubled delimiters within the string make
7218
    this an overestimate, but it is not worth bothering about. */
7219
7220
0
    case META_CALLOUT_STRING:
7221
0
    if (lengthptr != NULL)
7222
0
      {
7223
0
      *lengthptr += pptr[3] + (1 + 4*LINK_SIZE);
7224
0
      pptr += 3;
7225
0
      SKIPOFFSET(pptr);
7226
0
      }
7227
7228
    /* In the real compile we can copy the string. The starting delimiter is
7229
     included so that the client can discover it if they want. We also pass the
7230
     start offset to help a script language give better error messages. */
7231
7232
0
    else
7233
0
      {
7234
0
      PCRE2_SPTR pp;
7235
0
      uint32_t delimiter;
7236
0
      uint32_t length = pptr[3];
7237
0
      PCRE2_UCHAR *callout_string = code + (1 + 4*LINK_SIZE);
7238
7239
0
      code[0] = OP_CALLOUT_STR;
7240
0
      PUT(code, 1, pptr[1]);               /* Offset to next pattern item */
7241
0
      PUT(code, 1 + LINK_SIZE, pptr[2]);   /* Length of next pattern item */
7242
7243
0
      pptr += 3;
7244
0
      GETPLUSOFFSET(offset, pptr);         /* Offset to string in pattern */
7245
0
      pp = cb->start_pattern + offset;
7246
0
      delimiter = *callout_string++ = *pp++;
7247
0
      if (delimiter == CHAR_LEFT_CURLY_BRACKET)
7248
0
        delimiter = CHAR_RIGHT_CURLY_BRACKET;
7249
0
      PUT(code, 1 + 3*LINK_SIZE, (int)(offset + 1));  /* One after delimiter */
7250
7251
      /* The syntax of the pattern was checked in the parsing scan. The length
7252
      includes both delimiters, but we have passed the opening one just above,
7253
      so we reduce length before testing it. The test is for > 1 because we do
7254
      not want to copy the final delimiter. This also ensures that pp[1] is
7255
      accessible. */
7256
7257
0
      while (--length > 1)
7258
0
        {
7259
0
        if (*pp == delimiter && pp[1] == delimiter)
7260
0
          {
7261
0
          *callout_string++ = delimiter;
7262
0
          pp += 2;
7263
0
          length--;
7264
0
          }
7265
0
        else *callout_string++ = *pp++;
7266
0
        }
7267
0
      *callout_string++ = CHAR_NUL;
7268
7269
      /* Set the length of the entire item, then advance to its end. */
7270
7271
0
      PUT(code, 1 + 2*LINK_SIZE, (int)(callout_string - code));
7272
0
      code = callout_string;
7273
0
      }
7274
0
    break;
7275
7276
7277
    /* ===================================================================*/
7278
    /* Handle repetition. The different types are all sorted out in the parsing
7279
    pass. */
7280
7281
2.50k
    case META_MINMAX_PLUS:
7282
2.65k
    case META_MINMAX_QUERY:
7283
12.7k
    case META_MINMAX:
7284
12.7k
    repeat_min = *(++pptr);
7285
12.7k
    repeat_max = *(++pptr);
7286
12.7k
    goto REPEAT;
7287
7288
6.92k
    case META_ASTERISK:
7289
7.13k
    case META_ASTERISK_PLUS:
7290
7.84k
    case META_ASTERISK_QUERY:
7291
7.84k
    repeat_min = 0;
7292
7.84k
    repeat_max = REPEAT_UNLIMITED;
7293
7.84k
    goto REPEAT;
7294
7295
7.78k
    case META_PLUS:
7296
10.3k
    case META_PLUS_PLUS:
7297
10.5k
    case META_PLUS_QUERY:
7298
10.5k
    repeat_min = 1;
7299
10.5k
    repeat_max = REPEAT_UNLIMITED;
7300
10.5k
    goto REPEAT;
7301
7302
4.72k
    case META_QUERY:
7303
5.63k
    case META_QUERY_PLUS:
7304
5.89k
    case META_QUERY_QUERY:
7305
5.89k
    repeat_min = 0;
7306
5.89k
    repeat_max = 1;
7307
7308
37.0k
    REPEAT:
7309
37.0k
    if (previous_matched_char && repeat_min > 0) matched_char = TRUE;
7310
7311
    /* Remember whether this is a variable length repeat, and default to
7312
    single-char opcodes. */
7313
7314
37.0k
    reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
7315
7316
    /* Adjust first and required code units for a zero repeat. */
7317
7318
37.0k
    if (repeat_min == 0)
7319
15.1k
      {
7320
15.1k
      firstcu = zerofirstcu;
7321
15.1k
      firstcuflags = zerofirstcuflags;
7322
15.1k
      reqcu = zeroreqcu;
7323
15.1k
      reqcuflags = zeroreqcuflags;
7324
15.1k
      }
7325
7326
    /* Note the greediness and possessiveness. */
7327
7328
37.0k
    switch (meta)
7329
37.0k
      {
7330
2.50k
      case META_MINMAX_PLUS:
7331
2.71k
      case META_ASTERISK_PLUS:
7332
5.28k
      case META_PLUS_PLUS:
7333
6.20k
      case META_QUERY_PLUS:
7334
6.20k
      repeat_type = 0;                  /* Force greedy */
7335
6.20k
      possessive_quantifier = TRUE;
7336
6.20k
      break;
7337
7338
148
      case META_MINMAX_QUERY:
7339
859
      case META_ASTERISK_QUERY:
7340
1.08k
      case META_PLUS_QUERY:
7341
1.34k
      case META_QUERY_QUERY:
7342
1.34k
      repeat_type = greedy_non_default;
7343
1.34k
      possessive_quantifier = FALSE;
7344
1.34k
      break;
7345
7346
29.4k
      default:
7347
29.4k
      repeat_type = greedy_default;
7348
29.4k
      possessive_quantifier = FALSE;
7349
29.4k
      break;
7350
37.0k
      }
7351
7352
    /* Save start of previous item, in case we have to move it up in order to
7353
    insert something before it, and remember what it was. */
7354
7355
37.0k
    PCRE2_ASSERT(previous != NULL);
7356
37.0k
    tempcode = previous;
7357
37.0k
    op_previous = *previous;
7358
7359
    /* Now handle repetition for the different types of item. If the repeat
7360
    minimum and the repeat maximum are both 1, we can ignore the quantifier for
7361
    non-parenthesized items, as they have only one alternative. For anything in
7362
    parentheses, we must not ignore if {1} is possessive. */
7363
7364
37.0k
    switch (op_previous)
7365
37.0k
      {
7366
      /* If previous was a character or negated character match, abolish the
7367
      item and generate a repeat item instead. If a char item has a minimum of
7368
      more than one, ensure that it is set in reqcu - it might not be if a
7369
      sequence such as x{3} is the first thing in a branch because the x will
7370
      have gone into firstcu instead.  */
7371
7372
7.92k
      case OP_CHAR:
7373
10.4k
      case OP_CHARI:
7374
11.1k
      case OP_NOT:
7375
11.9k
      case OP_NOTI:
7376
11.9k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7377
11.7k
      op_type = chartypeoffset[op_previous - OP_CHAR];
7378
7379
      /* Deal with UTF characters that take up more than one code unit. */
7380
7381
11.7k
#ifdef MAYBE_UTF_MULTI
7382
11.7k
      if (utf && NOT_FIRSTCU(code[-1]))
7383
0
        {
7384
0
        PCRE2_UCHAR *lastchar = code - 1;
7385
0
        BACKCHAR(lastchar);
7386
0
        mclength = (uint32_t)(code - lastchar);   /* Length of UTF character */
7387
0
        memcpy(mcbuffer, lastchar, CU2BYTES(mclength));  /* Save the char */
7388
0
        }
7389
11.7k
      else
7390
11.7k
#endif  /* MAYBE_UTF_MULTI */
7391
7392
      /* Handle the case of a single code unit - either with no UTF support, or
7393
      with UTF disabled, or for a single-code-unit UTF character. In the latter
7394
      case, for a repeated positive match, get the caseless flag for the
7395
      required code unit from the previous character, because a class like [Aa]
7396
      sets a caseless A but by now the req_caseopt flag has been reset. */
7397
7398
11.7k
        {
7399
11.7k
        mcbuffer[0] = code[-1];
7400
11.7k
        mclength = 1;
7401
11.7k
        if (op_previous <= OP_CHARI && repeat_min > 1)
7402
2.12k
          {
7403
2.12k
          reqcu = mcbuffer[0];
7404
2.12k
          reqcuflags = cb->req_varyopt;
7405
2.12k
          if (op_previous == OP_CHARI) reqcuflags |= REQ_CASELESS;
7406
2.12k
          }
7407
11.7k
        }
7408
11.7k
      goto OUTPUT_SINGLE_REPEAT;  /* Code shared with single character types */
7409
7410
      /* If previous was a character class or a back reference, we put the
7411
      repeat stuff after it, but just skip the item if the repeat was {0,0}. */
7412
7413
0
#ifdef SUPPORT_WIDE_CHARS
7414
0
      case OP_XCLASS:
7415
0
      case OP_ECLASS:
7416
0
#endif
7417
1.96k
      case OP_CLASS:
7418
2.43k
      case OP_NCLASS:
7419
2.45k
      case OP_REF:
7420
2.45k
      case OP_REFI:
7421
2.84k
      case OP_DNREF:
7422
3.17k
      case OP_DNREFI:
7423
7424
3.17k
      if (repeat_max == 0)
7425
0
        {
7426
0
        code = previous;
7427
0
        goto END_REPEAT;
7428
0
        }
7429
3.17k
      if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7430
7431
3.09k
      if (repeat_min == 0 && repeat_max == REPEAT_UNLIMITED)
7432
162
        *code++ = OP_CRSTAR + repeat_type;
7433
2.93k
      else if (repeat_min == 1 && repeat_max == REPEAT_UNLIMITED)
7434
1.10k
        *code++ = OP_CRPLUS + repeat_type;
7435
1.82k
      else if (repeat_min == 0 && repeat_max == 1)
7436
1.63k
        *code++ = OP_CRQUERY + repeat_type;
7437
194
      else
7438
194
        {
7439
194
        *code++ = OP_CRRANGE + repeat_type;
7440
194
        PUT2INC(code, 0, repeat_min);
7441
194
        if (repeat_max == REPEAT_UNLIMITED) repeat_max = 0;  /* 2-byte encoding for max */
7442
194
        PUT2INC(code, 0, repeat_max);
7443
194
        }
7444
3.09k
      break;
7445
7446
      /* Prior to 10.30, repeated recursions were wrapped in OP_ONCE brackets
7447
      because pcre2_match() could not handle backtracking into recursively
7448
      called groups. Now that this backtracking is available, we no longer need
7449
      to do this. However, we still need to replicate recursions as we do for
7450
      groups so as to have independent backtracking points. We can replicate
7451
      for the minimum number of repeats directly. For optional repeats we now
7452
      wrap the recursion in OP_BRA brackets and make use of the bracket
7453
      repetition. */
7454
7455
2.61k
      case OP_RECURSE:
7456
2.61k
      if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7457
2
        goto END_REPEAT;
7458
7459
      /* Generate unwrapped repeats for a non-zero minimum, except when the
7460
      minimum is 1 and the maximum unlimited, because that can be handled with
7461
      OP_BRA terminated by OP_KETRMAX/MIN. When the maximum is equal to the
7462
      minimum, we just need to generate the appropriate additional copies.
7463
      Otherwise we need to generate one more, to simulate the situation when
7464
      the minimum is zero. */
7465
7466
2.61k
      if (repeat_min > 0 && (repeat_min != 1 || repeat_max != REPEAT_UNLIMITED))
7467
287
        {
7468
287
        int replicate = repeat_min;
7469
7470
287
        if (repeat_min == repeat_max) replicate--;
7471
7472
        /* In the pre-compile phase, we don't actually do the replication. We
7473
        just adjust the length as if we had. Do some paranoid checks for
7474
        potential integer overflow. */
7475
7476
287
        if (lengthptr != NULL)
7477
145
          {
7478
145
          PCRE2_SIZE delta;
7479
145
          if (PRIV(ckd_smul)(&delta, replicate, (int)length_prevgroup) ||
7480
145
              OFLOW_MAX - *lengthptr < delta)
7481
0
            {
7482
0
            *errorcodeptr = ERR20;
7483
0
            return 0;
7484
0
            }
7485
145
          *lengthptr += delta;
7486
145
          }
7487
38.9k
        else for (i = 0; i < replicate; i++)
7488
38.8k
          {
7489
38.8k
          memcpy(code, previous, CU2BYTES(length_prevgroup));
7490
38.8k
          previous = code;
7491
38.8k
          code += length_prevgroup;
7492
38.8k
          }
7493
7494
        /* If the number of repeats is fixed, we are done. Otherwise, adjust
7495
        the counts and fall through. */
7496
7497
287
        if (repeat_min == repeat_max) break;
7498
39
        if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7499
39
        repeat_min = 0;
7500
39
        }
7501
7502
      /* Wrap the recursion call in OP_BRA brackets. */
7503
2.36k
        {
7504
2.36k
        PCRE2_SIZE length = (lengthptr != NULL) ? 1 + LINK_SIZE : length_prevgroup;
7505
7506
2.36k
        (void)memmove(previous + 1 + LINK_SIZE, previous, CU2BYTES(length));
7507
2.36k
        op_previous = *previous = OP_BRA;
7508
2.36k
        PUT(previous, 1, 1 + LINK_SIZE + length);
7509
2.36k
        previous[1 + LINK_SIZE + length] = OP_KET;
7510
2.36k
        PUT(previous, 2 + LINK_SIZE + length, 1 + LINK_SIZE + length);
7511
2.36k
        }
7512
2.36k
      code += 2 + 2 * LINK_SIZE;
7513
2.36k
      length_prevgroup += 2 + 2 * LINK_SIZE;
7514
2.36k
      group_return = -1;  /* Set "may match empty string" */
7515
7516
      /* Now treat as a repeated OP_BRA. */
7517
2.36k
      PCRE2_FALLTHROUGH /* Fall through */
7518
7519
      /* If previous was a bracket group, we may have to replicate it in
7520
      certain cases. Note that at this point we can encounter only the "basic"
7521
      bracket opcodes such as BRA and CBRA, as this is the place where they get
7522
      converted into the more special varieties such as BRAPOS and SBRA.
7523
      Originally, PCRE did not allow repetition of assertions, but now it does,
7524
      for Perl compatibility. */
7525
7526
2.39k
      case OP_ASSERT:
7527
3.59k
      case OP_ASSERT_NOT:
7528
3.59k
      case OP_ASSERT_NA:
7529
3.60k
      case OP_ASSERTBACK:
7530
3.61k
      case OP_ASSERTBACK_NOT:
7531
3.61k
      case OP_ASSERTBACK_NA:
7532
3.61k
      case OP_ASSERT_SCS:
7533
4.22k
      case OP_ONCE:
7534
4.22k
      case OP_SCRIPT_RUN:
7535
4.69k
      case OP_BRA:
7536
14.1k
      case OP_CBRA:
7537
15.3k
      case OP_COND:
7538
15.3k
        {
7539
15.3k
        int len = (int)(code - previous);
7540
15.3k
        PCRE2_UCHAR *bralink = NULL;
7541
15.3k
        PCRE2_UCHAR *brazeroptr = NULL;
7542
7543
15.3k
        if (repeat_max == 1 && repeat_min == 1 && !possessive_quantifier)
7544
57
          goto END_REPEAT;
7545
7546
        /* Repeating a DEFINE group (or any group where the condition is always
7547
        FALSE and there is only one branch) is pointless, but Perl allows the
7548
        syntax, so we just ignore the repeat. */
7549
7550
15.3k
        if (op_previous == OP_COND && previous[LINK_SIZE+1] == OP_FALSE &&
7551
54
            previous[GET(previous, 1)] != OP_ALT)
7552
54
          goto END_REPEAT;
7553
7554
        /* Perl allows all assertions to be quantified, and when they contain
7555
        capturing parentheses and/or are optional there are potential uses for
7556
        this feature. PCRE2 used to force the maximum quantifier to 1 on the
7557
        invalid grounds that further repetition was never useful. This was
7558
        always a bit pointless, since an assertion could be wrapped with a
7559
        repeated group to achieve the effect. General repetition is now
7560
        permitted, but if the maximum is unlimited it is set to one more than
7561
        the minimum. */
7562
7563
15.2k
        if (op_previous < OP_ONCE)    /* Assertion */
7564
1.24k
          {
7565
1.24k
          if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;
7566
1.24k
          }
7567
7568
        /* The case of a zero minimum is special because of the need to stick
7569
        OP_BRAZERO in front of it, and because the group appears once in the
7570
        data, whereas in other cases it appears the minimum number of times. For
7571
        this reason, it is simplest to treat this case separately, as otherwise
7572
        the code gets far too messy. There are several special subcases when the
7573
        minimum is zero. */
7574
7575
15.2k
        if (repeat_min == 0)
7576
4.11k
          {
7577
          /* If the maximum is also zero, we used to just omit the group from
7578
          the output altogether, like this:
7579
7580
          ** if (repeat_max == 0)
7581
          **   {
7582
          **   code = previous;
7583
          **   goto END_REPEAT;
7584
          **   }
7585
7586
          However, that fails when a group or a subgroup within it is
7587
          referenced as a subroutine from elsewhere in the pattern, so now we
7588
          stick in OP_SKIPZERO in front of it so that it is skipped on
7589
          execution. As we don't have a list of which groups are referenced, we
7590
          cannot do this selectively.
7591
7592
          If the maximum is 1 or unlimited, we just have to stick in the
7593
          BRAZERO and do no more at this point. */
7594
7595
4.11k
          if (repeat_max <= 1 || repeat_max == REPEAT_UNLIMITED)
7596
3.74k
            {
7597
3.74k
            (void)memmove(previous + 1, previous, CU2BYTES(len));
7598
3.74k
            code++;
7599
3.74k
            if (repeat_max == 0)
7600
94
              {
7601
94
              *previous++ = OP_SKIPZERO;
7602
94
              goto END_REPEAT;
7603
94
              }
7604
3.65k
            brazeroptr = previous;    /* Save for possessive optimizing */
7605
3.65k
            *previous++ = OP_BRAZERO + repeat_type;
7606
3.65k
            }
7607
7608
          /* If the maximum is greater than 1 and limited, we have to replicate
7609
          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
7610
          The first one has to be handled carefully because it's the original
7611
          copy, which has to be moved up. The remainder can be handled by code
7612
          that is common with the non-zero minimum case below. We have to
7613
          adjust the value of repeat_max, since one less copy is required. */
7614
7615
370
          else
7616
370
            {
7617
370
            int linkoffset;
7618
370
            (void)memmove(previous + 2 + LINK_SIZE, previous, CU2BYTES(len));
7619
370
            code += 2 + LINK_SIZE;
7620
370
            *previous++ = OP_BRAZERO + repeat_type;
7621
370
            *previous++ = OP_BRA;
7622
7623
            /* We chain together the bracket link offset fields that have to be
7624
            filled in later when the ends of the brackets are reached. */
7625
7626
370
            linkoffset = (bralink == NULL)? 0 : (int)(previous - bralink);
7627
370
            bralink = previous;
7628
370
            PUTINC(previous, 0, linkoffset);
7629
370
            }
7630
7631
4.02k
          if (repeat_max != REPEAT_UNLIMITED) repeat_max--;
7632
4.02k
          }
7633
7634
        /* If the minimum is greater than zero, replicate the group as many
7635
        times as necessary, and adjust the maximum to the number of subsequent
7636
        copies that we need. */
7637
7638
11.1k
        else
7639
11.1k
          {
7640
11.1k
          if (repeat_min > 1)
7641
5.86k
            {
7642
            /* In the pre-compile phase, we don't actually do the replication.
7643
            We just adjust the length as if we had. Do some paranoid checks for
7644
            potential integer overflow. */
7645
7646
5.86k
            if (lengthptr != NULL)
7647
3.08k
              {
7648
3.08k
              PCRE2_SIZE delta;
7649
3.08k
              if (PRIV(ckd_smul)(&delta, repeat_min - 1,
7650
3.08k
                                 (int)length_prevgroup) ||
7651
3.08k
                  OFLOW_MAX - *lengthptr < delta)
7652
0
                {
7653
0
                *errorcodeptr = ERR20;
7654
0
                return 0;
7655
0
                }
7656
3.08k
              *lengthptr += delta;
7657
3.08k
              }
7658
7659
            /* This is compiling for real. If there is a set first code unit
7660
            for the group, and we have not yet set a "required code unit", set
7661
            it. */
7662
7663
2.78k
            else
7664
2.78k
              {
7665
2.78k
              if (groupsetfirstcu && reqcuflags >= REQ_NONE)
7666
166
                {
7667
166
                reqcu = firstcu;
7668
166
                reqcuflags = firstcuflags;
7669
166
                }
7670
361k
              for (j = 1; j < repeat_min; j++)
7671
359k
                {
7672
359k
                memcpy(code, previous, CU2BYTES(len));
7673
359k
                code += len;
7674
359k
                }
7675
2.78k
              }
7676
5.86k
            }
7677
7678
11.1k
          if (repeat_max != REPEAT_UNLIMITED) repeat_max -= repeat_min;
7679
11.1k
          }
7680
7681
        /* This code is common to both the zero and non-zero minimum cases. If
7682
        the maximum is limited, it replicates the group in a nested fashion,
7683
        remembering the bracket starts on a stack. In the case of a zero
7684
        minimum, the first one was set up above. In all cases the repeat_max
7685
        now specifies the number of additional copies needed. Again, we must
7686
        remember to replicate entries on the forward reference list. */
7687
7688
15.1k
        if (repeat_max != REPEAT_UNLIMITED)
7689
8.76k
          {
7690
          /* In the pre-compile phase, we don't actually do the replication. We
7691
          just adjust the length as if we had. For each repetition we must add
7692
          1 to the length for BRAZERO and for all but the last repetition we
7693
          must add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
7694
          paranoid checks to avoid integer overflow. */
7695
7696
8.76k
          if (lengthptr != NULL && repeat_max > 0)
7697
950
            {
7698
950
            PCRE2_SIZE delta;
7699
950
            if (PRIV(ckd_smul)(&delta, repeat_max,
7700
950
                               (int)length_prevgroup + 1 + 2 + 2*LINK_SIZE) ||
7701
950
                OFLOW_MAX + (2 + 2*LINK_SIZE) - *lengthptr < delta)
7702
0
              {
7703
0
              *errorcodeptr = ERR20;
7704
0
              return 0;
7705
0
              }
7706
950
            delta -= (2 + 2*LINK_SIZE);   /* Last one doesn't nest */
7707
950
            *lengthptr += delta;
7708
950
            }
7709
7710
          /* This is compiling for real */
7711
7712
37.5k
          else for (i = repeat_max; i >= 1; i--)
7713
29.7k
            {
7714
29.7k
            *code++ = OP_BRAZERO + repeat_type;
7715
7716
            /* All but the final copy start a new nesting, maintaining the
7717
            chain of brackets outstanding. */
7718
7719
29.7k
            if (i != 1)
7720
28.9k
              {
7721
28.9k
              int linkoffset;
7722
28.9k
              *code++ = OP_BRA;
7723
28.9k
              linkoffset = (bralink == NULL)? 0 : (int)(code - bralink);
7724
28.9k
              bralink = code;
7725
28.9k
              PUTINC(code, 0, linkoffset);
7726
28.9k
              }
7727
7728
29.7k
            memcpy(code, previous, CU2BYTES(len));
7729
29.7k
            code += len;
7730
29.7k
            }
7731
7732
          /* Now chain through the pending brackets, and fill in their length
7733
          fields (which are holding the chain links pro tem). */
7734
7735
38.0k
          while (bralink != NULL)
7736
29.2k
            {
7737
29.2k
            int oldlinkoffset;
7738
29.2k
            int linkoffset = (int)(code - bralink + 1);
7739
29.2k
            PCRE2_UCHAR *bra = code - linkoffset;
7740
29.2k
            oldlinkoffset = GET(bra, 1);
7741
29.2k
            bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
7742
29.2k
            *code++ = OP_KET;
7743
29.2k
            PUTINC(code, 0, linkoffset);
7744
29.2k
            PUT(bra, 1, linkoffset);
7745
29.2k
            }
7746
8.76k
          }
7747
7748
        /* If the maximum is unlimited, set a repeater in the final copy. For
7749
        SCRIPT_RUN and ONCE brackets, that's all we need to do. However,
7750
        possessively repeated ONCE brackets can be converted into non-capturing
7751
        brackets, as the behaviour of (?:xx)++ is the same as (?>xx)++ and this
7752
        saves having to deal with possessive ONCEs specially.
7753
7754
        Otherwise, when we are doing the actual compile phase, check to see
7755
        whether this group is one that could match an empty string. If so,
7756
        convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
7757
        that runtime checking can be done. [This check is also applied to ONCE
7758
        and SCRIPT_RUN groups at runtime, but in a different way.]
7759
7760
        Then, if the quantifier was possessive and the bracket is not a
7761
        conditional, we convert the BRA code to the POS form, and the KET code
7762
        to KETRPOS. (It turns out to be convenient at runtime to detect this
7763
        kind of subpattern at both the start and at the end.) The use of
7764
        special opcodes makes it possible to reduce greatly the stack usage in
7765
        pcre2_match(). If the group is preceded by OP_BRAZERO, convert this to
7766
        OP_BRAPOSZERO.
7767
7768
        Then, if the minimum number of matches is 1 or 0, cancel the possessive
7769
        flag so that the default action below, of wrapping everything inside
7770
        atomic brackets, does not happen. When the minimum is greater than 1,
7771
        there will be earlier copies of the group, and so we still have to wrap
7772
        the whole thing. */
7773
7774
6.39k
        else
7775
6.39k
          {
7776
6.39k
          PCRE2_UCHAR *ketcode = code - 1 - LINK_SIZE;
7777
6.39k
          PCRE2_UCHAR *bracode = ketcode - GET(ketcode, 1);
7778
7779
          /* Convert possessive ONCE brackets to non-capturing */
7780
7781
6.39k
          if (*bracode == OP_ONCE && possessive_quantifier) *bracode = OP_BRA;
7782
7783
          /* For non-possessive ONCE and for SCRIPT_RUN brackets, all we need
7784
          to do is to set the KET. */
7785
7786
6.39k
          if (*bracode == OP_ONCE || *bracode == OP_SCRIPT_RUN)
7787
220
            *ketcode = OP_KETRMAX + repeat_type;
7788
7789
          /* Handle non-SCRIPT_RUN and non-ONCE brackets and possessive ONCEs
7790
          (which have been converted to non-capturing above). */
7791
7792
6.17k
          else
7793
6.17k
            {
7794
            /* In the compile phase, adjust the opcode if the group can match
7795
            an empty string. For a conditional group with only one branch, the
7796
            value of group_return will not show "could be empty", so we must
7797
            check that separately. */
7798
7799
6.17k
            if (lengthptr == NULL)
7800
3.00k
              {
7801
3.00k
              if (group_return < 0) *bracode += OP_SBRA - OP_BRA;
7802
3.00k
              if (*bracode == OP_COND && bracode[GET(bracode,1)] != OP_ALT)
7803
17
                *bracode = OP_SCOND;
7804
3.00k
              }
7805
7806
            /* Handle possessive quantifiers. */
7807
7808
6.17k
            if (possessive_quantifier)
7809
2.84k
              {
7810
              /* For COND brackets, we wrap the whole thing in a possessively
7811
              repeated non-capturing bracket, because we have not invented POS
7812
              versions of the COND opcodes. */
7813
7814
2.84k
              if (*bracode == OP_COND || *bracode == OP_SCOND)
7815
646
                {
7816
646
                int nlen = (int)(code - bracode);
7817
646
                (void)memmove(bracode + 1 + LINK_SIZE, bracode, CU2BYTES(nlen));
7818
646
                code += 1 + LINK_SIZE;
7819
646
                nlen += 1 + LINK_SIZE;
7820
646
                *bracode = (*bracode == OP_COND)? OP_BRAPOS : OP_SBRAPOS;
7821
646
                *code++ = OP_KETRPOS;
7822
646
                PUTINC(code, 0, nlen);
7823
646
                PUT(bracode, 1, nlen);
7824
646
                }
7825
7826
              /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
7827
7828
2.19k
              else
7829
2.19k
                {
7830
2.19k
                *bracode += 1;              /* Switch to xxxPOS opcodes */
7831
2.19k
                *ketcode = OP_KETRPOS;
7832
2.19k
                }
7833
7834
              /* If the minimum is zero, mark it as possessive, then unset the
7835
              possessive flag when the minimum is 0 or 1. */
7836
7837
2.84k
              if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
7838
2.84k
              if (repeat_min < 2) possessive_quantifier = FALSE;
7839
2.84k
              }
7840
7841
            /* Non-possessive quantifier */
7842
7843
3.33k
            else *ketcode = OP_KETRMAX + repeat_type;
7844
6.17k
            }
7845
6.39k
          }
7846
15.1k
        }
7847
15.1k
      break;
7848
7849
      /* If previous was a character type match (\d or similar), abolish it and
7850
      create a suitable repeat item. The code is shared with single-character
7851
      repeats by setting op_type to add a suitable offset into repeat_type.
7852
      Note that the Unicode property types will be present only when
7853
      SUPPORT_UNICODE is defined, but we don't wrap the little bits of code
7854
      here because it just makes it horribly messy. */
7855
7856
15.1k
      default:
7857
7858
      /* LCOV_EXCL_START */
7859
6.30k
      if (op_previous >= OP_EODN || op_previous <= OP_WORD_BOUNDARY)
7860
0
        {
7861
0
        PCRE2_DEBUG_UNREACHABLE();
7862
0
        *errorcodeptr = ERR10;  /* Not a character type - internal error */
7863
0
        return 0;
7864
0
        }
7865
      /* LCOV_EXCL_STOP */
7866
7867
6.30k
        {
7868
6.30k
        int prop_type, prop_value;
7869
6.30k
        PCRE2_UCHAR *oldcode;
7870
7871
6.30k
        if (repeat_max == 1 && repeat_min == 1) goto END_REPEAT;
7872
7873
6.24k
        op_type = OP_TYPESTAR - OP_STAR;      /* Use type opcodes */
7874
6.24k
        mclength = 0;                         /* Not a character */
7875
7876
6.24k
        if (op_previous == OP_PROP || op_previous == OP_NOTPROP)
7877
0
          {
7878
0
          prop_type = previous[1];
7879
0
          prop_value = previous[2];
7880
0
          }
7881
6.24k
        else
7882
6.24k
          {
7883
          /* Come here from just above with a character in mcbuffer/mclength.
7884
          You must also set op_type before the jump. */
7885
17.9k
          OUTPUT_SINGLE_REPEAT:
7886
17.9k
          prop_type = prop_value = -1;
7887
17.9k
          }
7888
7889
        /* At this point, if prop_type == prop_value == -1 we either have a
7890
        character in mcbuffer when mclength is greater than zero, or we have
7891
        mclength zero, in which case there is a non-property character type in
7892
        op_previous. If prop_type/value are not negative, we have a property
7893
        character type in op_previous. */
7894
7895
17.9k
        oldcode = code;                   /* Save where we were */
7896
17.9k
        code = previous;                  /* Usually overwrite previous item */
7897
7898
        /* If the maximum is zero then the minimum must also be zero; Perl allows
7899
        this case, so we do too - by simply omitting the item altogether. */
7900
7901
17.9k
        if (repeat_max == 0) goto END_REPEAT;
7902
7903
        /* Combine the op_type with the repeat_type */
7904
7905
17.4k
        repeat_type += op_type;
7906
7907
        /* A minimum of zero is handled either as the special case * or ?, or as
7908
        an UPTO, with the maximum given. */
7909
7910
17.4k
        if (repeat_min == 0)
7911
8.63k
          {
7912
8.63k
          if (repeat_max == REPEAT_UNLIMITED) *code++ = OP_STAR + repeat_type;
7913
3.00k
            else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
7914
372
          else
7915
372
            {
7916
372
            *code++ = OP_UPTO + repeat_type;
7917
372
            PUT2INC(code, 0, repeat_max);
7918
372
            }
7919
8.63k
          }
7920
7921
        /* A repeat minimum of 1 is optimized into some special cases. If the
7922
        maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
7923
        left in place and, if the maximum is greater than 1, we use OP_UPTO with
7924
        one less than the maximum. */
7925
7926
8.76k
        else if (repeat_min == 1)
7927
5.76k
          {
7928
5.76k
          if (repeat_max == REPEAT_UNLIMITED)
7929
4.94k
            *code++ = OP_PLUS + repeat_type;
7930
828
          else
7931
828
            {
7932
828
            code = oldcode;  /* Leave previous item in place */
7933
828
            if (repeat_max == 1) goto END_REPEAT;
7934
828
            *code++ = OP_UPTO + repeat_type;
7935
828
            PUT2INC(code, 0, repeat_max - 1);
7936
828
            }
7937
5.76k
          }
7938
7939
        /* The case {n,n} is just an EXACT, while the general case {n,m} is
7940
        handled as an EXACT followed by an UPTO or STAR or QUERY. */
7941
7942
2.99k
        else
7943
2.99k
          {
7944
2.99k
          *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
7945
2.99k
          PUT2INC(code, 0, repeat_min);
7946
7947
          /* Unless repeat_max equals repeat_min, fill in the data for EXACT,
7948
          and then generate the second opcode. For a repeated Unicode property
7949
          match, there are two extra values that define the required property,
7950
          and mclength is set zero to indicate this. */
7951
7952
2.99k
          if (repeat_max != repeat_min)
7953
717
            {
7954
717
            if (mclength > 0)
7955
656
              {
7956
656
              memcpy(code, mcbuffer, CU2BYTES(mclength));
7957
656
              code += mclength;
7958
656
              }
7959
61
            else
7960
61
              {
7961
61
              *code++ = op_previous;
7962
61
              if (prop_type >= 0)
7963
0
                {
7964
0
                *code++ = prop_type;
7965
0
                *code++ = prop_value;
7966
0
                }
7967
61
              }
7968
7969
            /* Now set up the following opcode */
7970
7971
717
            if (repeat_max == REPEAT_UNLIMITED)
7972
240
              *code++ = OP_STAR + repeat_type;
7973
477
            else
7974
477
              {
7975
477
              repeat_max -= repeat_min;
7976
477
              if (repeat_max == 1)
7977
193
                {
7978
193
                *code++ = OP_QUERY + repeat_type;
7979
193
                }
7980
284
              else
7981
284
                {
7982
284
                *code++ = OP_UPTO + repeat_type;
7983
284
                PUT2INC(code, 0, repeat_max);
7984
284
                }
7985
477
              }
7986
717
            }
7987
2.99k
          }
7988
7989
        /* Fill in the character or character type for the final opcode. */
7990
7991
17.4k
        if (mclength > 0)
7992
11.1k
          {
7993
11.1k
          memcpy(code, mcbuffer, CU2BYTES(mclength));
7994
11.1k
          code += mclength;
7995
11.1k
          }
7996
6.21k
        else
7997
6.21k
          {
7998
6.21k
          *code++ = op_previous;
7999
6.21k
          if (prop_type >= 0)
8000
0
            {
8001
0
            *code++ = prop_type;
8002
0
            *code++ = prop_value;
8003
0
            }
8004
6.21k
          }
8005
17.4k
        }
8006
0
      break;
8007
37.0k
      }  /* End of switch on different op_previous values */
8008
8009
8010
    /* If the character following a repeat is '+', possessive_quantifier is
8011
    TRUE. For some opcodes, there are special alternative opcodes for this
8012
    case. For anything else, we wrap the entire repeated item inside OP_ONCE
8013
    brackets. Logically, the '+' notation is just syntactic sugar, taken from
8014
    Sun's Java package, but the special opcodes can optimize it.
8015
8016
    Some (but not all) possessively repeated subpatterns have already been
8017
    completely handled in the code just above. For them, possessive_quantifier
8018
    is always FALSE at this stage. Note that the repeated item starts at
8019
    tempcode, not at previous, which might be the first part of a string whose
8020
    (former) last char we repeated. */
8021
8022
35.9k
    if (possessive_quantifier)
8023
3.82k
      {
8024
3.82k
      int len;
8025
8026
      /* Possessifying an EXACT quantifier has no effect, so we can ignore it.
8027
      However, QUERY, STAR, or UPTO may follow (for quantifiers such as {5,6},
8028
      {5,}, or {5,10}). We skip over an EXACT item; if the length of what
8029
      remains is greater than zero, there's a further opcode that can be
8030
      handled. If not, do nothing, leaving the EXACT alone. */
8031
8032
3.82k
      switch(*tempcode)
8033
3.82k
        {
8034
205
        case OP_TYPEEXACT:
8035
205
        tempcode += PRIV(OP_lengths)[*tempcode] +
8036
205
          ((tempcode[1 + IMM2_SIZE] == OP_PROP
8037
205
          || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
8038
205
        break;
8039
8040
        /* CHAR opcodes are used for exacts whose count is 1. */
8041
8042
74
        case OP_CHAR:
8043
266
        case OP_CHARI:
8044
458
        case OP_NOT:
8045
688
        case OP_NOTI:
8046
909
        case OP_EXACT:
8047
1.11k
        case OP_EXACTI:
8048
1.18k
        case OP_NOTEXACT:
8049
1.37k
        case OP_NOTEXACTI:
8050
1.37k
        tempcode += PRIV(OP_lengths)[*tempcode];
8051
1.37k
#ifdef SUPPORT_UNICODE
8052
1.37k
        if (utf && HAS_EXTRALEN(tempcode[-1]))
8053
0
          tempcode += GET_EXTRALEN(tempcode[-1]);
8054
1.37k
#endif
8055
1.37k
        break;
8056
8057
        /* For the class opcodes, the repeat operator appears at the end;
8058
        adjust tempcode to point to it. */
8059
8060
411
        case OP_CLASS:
8061
623
        case OP_NCLASS:
8062
623
        tempcode += 1 + 32/sizeof(PCRE2_UCHAR);
8063
623
        break;
8064
8065
0
#ifdef SUPPORT_WIDE_CHARS
8066
0
        case OP_XCLASS:
8067
0
        case OP_ECLASS:
8068
0
        tempcode += GET(tempcode, 1);
8069
0
        break;
8070
3.82k
#endif
8071
3.82k
        }
8072
8073
      /* If tempcode is equal to code (which points to the end of the repeated
8074
      item), it means we have skipped an EXACT item but there is no following
8075
      QUERY, STAR, or UPTO; the value of len will be 0, and we do nothing. In
8076
      all other cases, tempcode will be pointing to the repeat opcode, and will
8077
      be less than code, so the value of len will be greater than 0. */
8078
8079
3.82k
      len = (int)(code - tempcode);
8080
3.82k
      if (len > 0)
8081
2.95k
        {
8082
2.95k
        unsigned int repcode = *tempcode;
8083
8084
        /* There is a table for possessifying opcodes, all of which are less
8085
        than OP_CALLOUT. A zero entry means there is no possessified version.
8086
        */
8087
8088
2.95k
        if (repcode < OP_CALLOUT && opcode_possessify[repcode] > 0)
8089
1.85k
          *tempcode = opcode_possessify[repcode];
8090
8091
        /* For opcodes without a special possessified version, wrap the item in
8092
        ONCE brackets. */
8093
8094
1.09k
        else
8095
1.09k
          {
8096
1.09k
          (void)memmove(tempcode + 1 + LINK_SIZE, tempcode, CU2BYTES(len));
8097
1.09k
          code += 1 + LINK_SIZE;
8098
1.09k
          len += 1 + LINK_SIZE;
8099
1.09k
          tempcode[0] = OP_ONCE;
8100
1.09k
          *code++ = OP_KET;
8101
1.09k
          PUTINC(code, 0, len);
8102
1.09k
          PUT(tempcode, 1, len);
8103
1.09k
          }
8104
2.95k
        }
8105
3.82k
      }
8106
8107
    /* We set the "follows varying string" flag for subsequently encountered
8108
    reqcus if it isn't already set and we have just passed a varying length
8109
    item. */
8110
8111
37.0k
    END_REPEAT:
8112
37.0k
    cb->req_varyopt |= reqvary;
8113
37.0k
    break;
8114
8115
8116
    /* ===================================================================*/
8117
    /* Handle a 32-bit data character with a value greater than META_END. */
8118
8119
0
    case META_BIGVALUE:
8120
0
    pptr++;
8121
0
    goto NORMAL_CHAR;
8122
8123
8124
    /* ===============================================================*/
8125
    /* Handle a back reference by number, which is the meta argument. The
8126
    pattern offsets for back references to group numbers less than 10 are held
8127
    in a special vector, to avoid using more than two parsed pattern elements
8128
    in 64-bit environments. We only need the offset to the first occurrence,
8129
    because if that doesn't fail, subsequent ones will also be OK. */
8130
8131
775
    case META_BACKREF:
8132
775
    if (meta_arg < 10) offset = cb->small_ref_offset[meta_arg];
8133
179
      else GETPLUSOFFSET(offset, pptr);
8134
8135
775
    if (meta_arg > cb->bracount)
8136
289
      {
8137
289
      cb->erroroffset = offset;
8138
289
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8139
289
      return 0;
8140
289
      }
8141
8142
    /* Come here from named backref handling when the reference is to a
8143
    single group (that is, not to a duplicated name). The back reference
8144
    data will have already been updated. We must disable firstcu if not
8145
    set, to cope with cases like (?=(\w+))\1: which would otherwise set ':'
8146
    later. */
8147
8148
885
    HANDLE_SINGLE_REFERENCE:
8149
885
    if (firstcuflags == REQ_UNSET) zerofirstcuflags = firstcuflags = REQ_NONE;
8150
885
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_REFI : OP_REF;
8151
885
    PUT2INC(code, 0, meta_arg);
8152
885
    if ((options & PCRE2_CASELESS) != 0)
8153
0
      *code++ = (((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)?
8154
0
                 REFI_FLAG_CASELESS_RESTRICT : 0) |
8155
0
                (((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)?
8156
0
                 REFI_FLAG_TURKISH_CASING : 0);
8157
8158
    /* Update the map of back references, and keep the highest one. We
8159
    could do this in parse_regex() for numerical back references, but not
8160
    for named back references, because we don't know the numbers to which
8161
    named back references refer. So we do it all in this function. */
8162
8163
885
    cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;
8164
885
    if (meta_arg > cb->top_backref) cb->top_backref = meta_arg;
8165
885
    break;
8166
8167
8168
    /* ===============================================================*/
8169
    /* Handle recursion by inserting the number of the called group (which is
8170
    the meta argument) after OP_RECURSE. At the end of compiling the pattern is
8171
    scanned and these numbers are replaced by offsets within the pattern. It is
8172
    done like this to avoid problems with forward references and adjusting
8173
    offsets when groups are duplicated and moved (as discovered in previous
8174
    implementations). Note that a recursion does not have a set first
8175
    character. */
8176
8177
10.6k
    case META_RECURSE:
8178
10.6k
    GETPLUSOFFSET(offset, pptr);
8179
10.6k
    if (meta_arg > cb->bracount)
8180
271
      {
8181
271
      cb->erroroffset = offset;
8182
271
      *errorcodeptr = ERR15;  /* Non-existent subpattern */
8183
271
      return 0;
8184
271
      }
8185
10.5k
    HANDLE_NUMERICAL_RECURSION:
8186
10.5k
    *code = OP_RECURSE;
8187
10.5k
    PUT(code, 1, meta_arg);
8188
10.5k
    code += 1 + LINK_SIZE;
8189
    /* Repeat processing requires this information to
8190
    determine the real length in pre-compile phase. */
8191
10.5k
    length_prevgroup = 1 + LINK_SIZE;
8192
8193
10.5k
    if (META_CODE(pptr[1]) == META_OFFSET ||
8194
10.5k
        META_CODE(pptr[1]) == META_CAPTURE_NAME ||
8195
10.5k
        META_CODE(pptr[1]) == META_CAPTURE_NUMBER)
8196
0
      {
8197
0
      recurse_arguments *args;
8198
8199
0
      if (lengthptr != NULL)
8200
0
        {
8201
0
        if (!PRIV(compile_parse_recurse_args)(pptr, offset, errorcodeptr, cb))
8202
0
          return 0;
8203
8204
0
        args = (recurse_arguments*)cb->last_data;
8205
0
        length_prevgroup += (args->size * (1 + IMM2_SIZE));
8206
0
        *lengthptr += (args->size * (1 + IMM2_SIZE));
8207
0
        pptr += args->skip_size;
8208
0
        }
8209
0
      else
8210
0
        {
8211
0
        uint16_t *current, *end;
8212
8213
0
        args = (recurse_arguments*)cb->first_data;
8214
0
        PCRE2_ASSERT(args != NULL && args->header.type == CDATA_RECURSE_ARGS);
8215
8216
0
        current = (uint16_t*)(args + 1);
8217
0
        end = current + args->size;
8218
0
        PCRE2_ASSERT(end > current);
8219
8220
0
        do
8221
0
          {
8222
0
          code[0] = OP_CREF;
8223
0
          PUT2(code, 1, *current);
8224
0
          code += 1 + IMM2_SIZE;
8225
0
          }
8226
0
        while (++current < end);
8227
8228
0
        length_prevgroup += (args->size * (1 + IMM2_SIZE));
8229
0
        pptr += args->skip_size;
8230
0
        cb->first_data = args->header.next;
8231
0
        cb->cx->memctl.free(args, cb->cx->memctl.memory_data);
8232
0
        }
8233
0
      }
8234
8235
10.5k
    groupsetfirstcu = FALSE;
8236
10.5k
    cb->had_recurse = TRUE;
8237
10.5k
    if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8238
10.5k
    zerofirstcu = firstcu;
8239
10.5k
    zerofirstcuflags = firstcuflags;
8240
10.5k
    break;
8241
8242
8243
    /* ===============================================================*/
8244
    /* Handle capturing parentheses; the number is the meta argument. */
8245
8246
46.6k
    case META_CAPTURE:
8247
46.6k
    bravalue = OP_CBRA;
8248
46.6k
    skipunits = IMM2_SIZE;
8249
46.6k
    PUT2(code, 1+LINK_SIZE, meta_arg);
8250
46.6k
    cb->lastcapture = meta_arg;
8251
46.6k
    goto GROUP_PROCESS_NOTE_EMPTY;
8252
8253
8254
    /* ===============================================================*/
8255
    /* Handle escape sequence items. For ones like \d, the ESC_values are
8256
    arranged to be the same as the corresponding OP_values in the default case
8257
    when PCRE2_UCP is not set (which is the only case in which they will appear
8258
    here).
8259
8260
    Note: \Q and \E are never seen here, as they were dealt with in
8261
    parse_regex(). Neither are numerical back references or recursions, which
8262
    were turned into META_BACKREF or META_RECURSE items, respectively. \k and
8263
    \g, when followed by names, are turned into META_BACKREF_BYNAME or
8264
    META_RECURSE_BYNAME. */
8265
8266
7.06k
    case META_ESCAPE:
8267
8268
    /* We can test for escape sequences that consume a character because their
8269
    values lie between ESC_b and ESC_Z; this may have to change if any new ones
8270
    are ever created. For these sequences, we disable the setting of a first
8271
    character if it hasn't already been set. */
8272
8273
7.06k
    if (meta_arg > ESC_b && meta_arg < ESC_Z)
8274
4.38k
      {
8275
4.38k
      matched_char = TRUE;
8276
4.38k
      if (firstcuflags == REQ_UNSET) firstcuflags = REQ_NONE;
8277
4.38k
      }
8278
8279
    /* Set values to reset to if this is followed by a zero repeat. */
8280
8281
7.06k
    zerofirstcu = firstcu;
8282
7.06k
    zerofirstcuflags = firstcuflags;
8283
7.06k
    zeroreqcu = reqcu;
8284
7.06k
    zeroreqcuflags = reqcuflags;
8285
8286
    /* If Unicode is not supported, \P and \p are not allowed and are
8287
    faulted at parse time, so will never appear here. */
8288
8289
7.06k
#ifdef SUPPORT_UNICODE
8290
7.06k
    if (meta_arg == ESC_P || meta_arg == ESC_p)
8291
0
      {
8292
0
      uint32_t ptype = *(++pptr) >> 16;
8293
0
      uint32_t pdata = *pptr & 0xffff;
8294
8295
      /* In caseless matching, particular characteristics Lu, Ll, and Lt get
8296
      converted to the general characteristic L&. That is, upper, lower, and
8297
      title case letters are all conflated. */
8298
8299
0
      if ((options & PCRE2_CASELESS) != 0 && ptype == PT_PC &&
8300
0
          (pdata == ucp_Lu || pdata == ucp_Ll || pdata == ucp_Lt))
8301
0
        {
8302
0
        ptype = PT_LAMP;
8303
0
        pdata = 0;
8304
0
        }
8305
8306
      /* The special case of \p{Any} is compiled to OP_ALLANY and \P{Any}
8307
      is compiled to [] so as to benefit from the auto-anchoring code. */
8308
8309
0
      if (ptype == PT_ANY)
8310
0
        {
8311
0
        if (meta_arg == ESC_P)
8312
0
          {
8313
0
          *code++ = OP_CLASS;
8314
0
          memset(code, 0, 32);
8315
0
          code += 32 / sizeof(PCRE2_UCHAR);
8316
0
          }
8317
0
        else
8318
0
          *code++ = OP_ALLANY;
8319
0
        }
8320
0
      else
8321
0
        {
8322
0
        *code++ = (meta_arg == ESC_p)? OP_PROP : OP_NOTPROP;
8323
0
        *code++ = ptype;
8324
0
        *code++ = pdata;
8325
0
        }
8326
0
      break;  /* End META_ESCAPE */
8327
0
      }
8328
7.06k
#endif
8329
8330
    /* \K is forbidden in lookarounds since 10.38 because that's what Perl has
8331
    done. However, there's an option, in case anyone was relying on it. */
8332
8333
7.06k
    if (cb->assert_depth > 0 && meta_arg == ESC_K &&
8334
8
        (xoptions & PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK) == 0)
8335
8
      {
8336
8
      *errorcodeptr = ERR99;
8337
8
      return 0;
8338
8
      }
8339
8340
    /* For the rest (including \X when Unicode is supported - if not it's
8341
    faulted at parse time), the OP value is the escape value when PCRE2_UCP is
8342
    not set; if it is set, most of them do not show up here because they are
8343
    converted into Unicode property tests in parse_regex().
8344
8345
    In non-UTF mode, and for both 32-bit modes, we turn \C into OP_ALLANY
8346
    instead of OP_ANYBYTE so that it works in DFA mode and in lookbehinds.
8347
    There are special UCP codes for \B and \b which are used in UCP mode unless
8348
    "word" matching is being forced to ASCII.
8349
8350
    Note that \b and \B do a one-character lookbehind, and \A also behaves as
8351
    if it does. */
8352
8353
7.05k
    switch(meta_arg)
8354
7.05k
      {
8355
1.57k
      case ESC_C:
8356
1.57k
      cb->external_flags |= PCRE2_HASBKC;  /* Record */
8357
#if PCRE2_CODE_UNIT_WIDTH == 32
8358
      meta_arg = OP_ALLANY;
8359
      (void)utf; /* Avoid compiler warning. */
8360
#else
8361
1.57k
      if (!utf) meta_arg = OP_ALLANY;
8362
1.57k
#endif
8363
1.57k
      break;
8364
8365
348
      case ESC_B:
8366
1.09k
      case ESC_b:
8367
1.09k
      if ((options & PCRE2_UCP) != 0 && (xoptions & PCRE2_EXTRA_ASCII_BSW) == 0)
8368
0
        meta_arg = (meta_arg == ESC_B)? OP_NOT_UCP_WORD_BOUNDARY :
8369
0
          OP_UCP_WORD_BOUNDARY;
8370
1.09k
      PCRE2_FALLTHROUGH /* Fall through */
8371
8372
1.73k
      case ESC_A:
8373
1.73k
      if (cb->max_lookbehind == 0) cb->max_lookbehind = 1;
8374
1.73k
      break;
8375
8376
40
      case ESC_K:
8377
40
      cb->external_flags |= PCRE2_HASBSK;  /* Record */
8378
40
      break;
8379
7.05k
      }
8380
8381
7.05k
    *code++ = meta_arg;
8382
7.05k
    break;  /* End META_ESCAPE */
8383
8384
8385
    /* ===================================================================*/
8386
    /* Handle an unrecognized meta value. A parsed pattern value less than
8387
    META_END is a literal. Otherwise we have a problem. */
8388
8389
108k
    default:
8390
    /* LCOV_EXCL_START */
8391
108k
    if (meta >= META_END)
8392
0
      {
8393
0
      PCRE2_DEBUG_UNREACHABLE();
8394
0
      *errorcodeptr = ERR89;  /* Internal error - unrecognized. */
8395
0
      return 0;
8396
0
      }
8397
    /* LCOV_EXCL_STOP */
8398
8399
    /* Handle a literal character. We come here by goto in the case of a
8400
    32-bit, non-UTF character whose value is greater than META_END. */
8401
8402
108k
    NORMAL_CHAR:
8403
108k
    meta = *pptr;     /* Get the full 32 bits */
8404
109k
    NORMAL_CHAR_SET:  /* Character is already in meta */
8405
109k
    matched_char = TRUE;
8406
8407
    /* For caseless UTF or UCP mode, check whether this character has more than
8408
    one other case. If so, generate a special OP_PROP item instead of OP_CHARI.
8409
    When casing restrictions apply, ignore caseless sets that start with an
8410
    ASCII character. If the character is affected by the special Turkish rules,
8411
    hardcode the matching characters using a caseset. */
8412
8413
109k
#ifdef SUPPORT_UNICODE
8414
109k
    if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)
8415
0
      {
8416
0
      uint32_t caseset;
8417
8418
0
      if ((xoptions & (PCRE2_EXTRA_TURKISH_CASING|PCRE2_EXTRA_CASELESS_RESTRICT)) ==
8419
0
            PCRE2_EXTRA_TURKISH_CASING &&
8420
0
          UCD_ANY_I(meta))
8421
0
        {
8422
0
        caseset = PRIV(ucd_turkish_dotted_i_caseset) + (UCD_DOTTED_I(meta)? 0 : 3);
8423
0
        }
8424
0
      else if ((caseset = UCD_CASESET(meta)) != 0 &&
8425
0
               (xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0 &&
8426
0
               PRIV(ucd_caseless_sets)[caseset] < 128)
8427
0
        {
8428
0
        caseset = 0;  /* Ignore the caseless set if it's restricted. */
8429
0
        }
8430
8431
0
      if (caseset != 0)
8432
0
        {
8433
0
        *code++ = OP_PROP;
8434
0
        *code++ = PT_CLIST;
8435
0
        *code++ = caseset;
8436
0
        if (firstcuflags == REQ_UNSET)
8437
0
          firstcuflags = zerofirstcuflags = REQ_NONE;
8438
0
        break;  /* End handling this meta item */
8439
0
        }
8440
0
      }
8441
109k
#endif
8442
8443
    /* Caseful matches, or caseless and not one of the multicase characters. We
8444
    come here by goto in the case of a positive class that contains only
8445
    case-partners of a character with just two cases; matched_char has already
8446
    been set TRUE and options fudged if necessary. */
8447
8448
109k
    CLASS_CASELESS_CHAR:
8449
8450
    /* Get the character's code units into mcbuffer, with the length in
8451
    mclength. When not in UTF mode, the length is always 1. */
8452
8453
109k
#ifdef SUPPORT_UNICODE
8454
109k
    if (utf) mclength = PRIV(ord2utf)(meta, mcbuffer); else
8455
109k
#endif
8456
109k
      {
8457
109k
      mclength = 1;
8458
109k
      mcbuffer[0] = meta;
8459
109k
      }
8460
8461
    /* Generate the appropriate code */
8462
8463
109k
    *code++ = ((options & PCRE2_CASELESS) != 0)? OP_CHARI : OP_CHAR;
8464
109k
    memcpy(code, mcbuffer, CU2BYTES(mclength));
8465
109k
    code += mclength;
8466
8467
    /* Remember if \r or \n were seen */
8468
8469
109k
    if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
8470
1.26k
      cb->external_flags |= PCRE2_HASCRORLF;
8471
8472
    /* Set the first and required code units appropriately. If no previous
8473
    first code unit, set it from this character, but revert to none on a zero
8474
    repeat. Otherwise, leave the firstcu value alone, and don't change it on
8475
    a zero repeat. */
8476
8477
109k
    if (firstcuflags == REQ_UNSET)
8478
22.9k
      {
8479
22.9k
      zerofirstcuflags = REQ_NONE;
8480
22.9k
      zeroreqcu = reqcu;
8481
22.9k
      zeroreqcuflags = reqcuflags;
8482
8483
      /* If the character is more than one code unit long, we can set a single
8484
      firstcu only if it is not to be matched caselessly. Multiple possible
8485
      starting code units may be picked up later in the studying code. */
8486
8487
22.9k
      if (mclength == 1 || req_caseopt == 0)
8488
22.9k
        {
8489
22.9k
        firstcu = mcbuffer[0];
8490
22.9k
        firstcuflags = req_caseopt;
8491
22.9k
        if (mclength != 1)
8492
0
          {
8493
0
          reqcu = code[-1];
8494
0
          reqcuflags = cb->req_varyopt;
8495
0
          }
8496
22.9k
        }
8497
0
      else firstcuflags = reqcuflags = REQ_NONE;
8498
22.9k
      }
8499
8500
    /* firstcu was previously set; we can set reqcu only if the length is
8501
    1 or the matching is caseful. */
8502
8503
86.8k
    else
8504
86.8k
      {
8505
86.8k
      zerofirstcu = firstcu;
8506
86.8k
      zerofirstcuflags = firstcuflags;
8507
86.8k
      zeroreqcu = reqcu;
8508
86.8k
      zeroreqcuflags = reqcuflags;
8509
86.8k
      if (mclength == 1 || req_caseopt == 0)
8510
86.8k
        {
8511
86.8k
        reqcu = code[-1];
8512
86.8k
        reqcuflags = req_caseopt | cb->req_varyopt;
8513
86.8k
        }
8514
86.8k
      }
8515
8516
    /* If caselessness was temporarily instated, reset it. */
8517
8518
109k
    if (reset_caseful)
8519
2
      {
8520
2
      options &= ~PCRE2_CASELESS;
8521
2
      req_caseopt = 0;
8522
2
      reset_caseful = FALSE;
8523
2
      }
8524
8525
109k
    break;    /* End literal character handling */
8526
360k
    }         /* End of big switch */
8527
360k
  }           /* End of big loop */
8528
8529
/* LCOV_EXCL_START */
8530
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8531
0
return 0;                  /* Avoid compiler warnings */
8532
/* LCOV_EXCL_STOP */
8533
101k
}
8534
8535
8536
8537
/*************************************************
8538
*   Compile regex: a sequence of alternatives    *
8539
*************************************************/
8540
8541
/* On entry, pptr is pointing past the bracket meta, but on return it points to
8542
the closing bracket or META_END. The code variable is pointing at the code unit
8543
into which the BRA operator has been stored. This function is used during the
8544
pre-compile phase when we are trying to find out the amount of memory needed,
8545
as well as during the real compile phase. The value of lengthptr distinguishes
8546
the two phases.
8547
8548
Arguments:
8549
  options           option bits, including any changes for this subpattern
8550
  xoptions          extra option bits, ditto
8551
  codeptr           -> the address of the current code pointer
8552
  pptrptr           -> the address of the current parsed pattern pointer
8553
  errorcodeptr      -> pointer to error code variable
8554
  skipunits         skip this many code units at start (for brackets and OP_COND)
8555
  firstcuptr        place to put the first required code unit
8556
  firstcuflagsptr   place to put the first code unit flags
8557
  reqcuptr          place to put the last required code unit
8558
  reqcuflagsptr     place to put the last required code unit flags
8559
  bcptr             pointer to the chain of currently open branches
8560
  cb                points to the data block with tables pointers etc.
8561
  lengthptr         NULL during the real compile phase
8562
                    points to length accumulator during pre-compile phase
8563
8564
Returns:            0 There has been an error
8565
                   +1 Success, this group must match at least one character
8566
                   -1 Success, this group may match an empty string
8567
*/
8568
8569
/* Compile a group as a sequence of alternative branches, calling
compile_branch() once per branch. The open_caps argument is the chain of
currently open capturing groups: when this group is OP_CBRA a new item for it
is placed at the front of the chain before it is passed to compile_branch(). */
static int
8570
compile_regex(uint32_t options, uint32_t xoptions, PCRE2_UCHAR **codeptr,
8571
  uint32_t **pptrptr, int *errorcodeptr, uint32_t skipunits,
8572
  uint32_t *firstcuptr, uint32_t *firstcuflagsptr, uint32_t *reqcuptr,
8573
  uint32_t *reqcuflagsptr, branch_chain *bcptr, open_capitem *open_caps,
8574
  compile_block *cb, PCRE2_SIZE *lengthptr)
8575
80.5k
{
8576
80.5k
PCRE2_UCHAR *code = *codeptr;
8577
80.5k
PCRE2_UCHAR *last_branch = code;  /* Start of the most recent branch; the bracket itself at first */
8578
80.5k
PCRE2_UCHAR *start_bracket = code;
8579
80.5k
BOOL lookbehind;
8580
80.5k
open_capitem capitem;
8581
80.5k
int capnumber = 0;
8582
80.5k
int okreturn = 1;  /* Changed to -1 if any branch may match an empty string */
8583
80.5k
uint32_t *pptr = *pptrptr;
8584
80.5k
uint32_t firstcu, reqcu;
8585
80.5k
uint32_t lookbehindlength;
8586
80.5k
uint32_t lookbehindminlength;
8587
80.5k
uint32_t firstcuflags, reqcuflags;
8588
80.5k
PCRE2_SIZE length;
8589
80.5k
branch_chain bc;
8590
8591
/* If set, call the external function that checks for stack availability. */
8592
8593
80.5k
if (cb->cx->stack_guard != NULL &&
8594
0
    cb->cx->stack_guard(cb->parens_depth, cb->cx->stack_guard_data))
8595
0
  {
8596
0
  *errorcodeptr= ERR33;  /* The guard returned non-zero: give up */
8597
0
  cb->erroroffset = 0;
8598
0
  return 0;
8599
0
  }
8600
8601
/* Miscellaneous initialization */
8602
8603
80.5k
bc.outer = bcptr;
8604
80.5k
bc.current_branch = code;
8605
8606
80.5k
firstcu = reqcu = 0;
8607
80.5k
firstcuflags = reqcuflags = REQ_UNSET;
8608
8609
/* Accumulate the length for use in the pre-compile phase. Start with the
8610
length of the BRA and KET and any extra code units that are required at the
8611
beginning. We accumulate in a local variable to save frequent testing of
8612
lengthptr for NULL. We cannot do this by looking at the value of 'code' at the
8613
start and end of each alternative, because compiled items are discarded during
8614
the pre-compile phase so that the workspace is not exceeded. */
8615
8616
80.5k
length = 2 + 2*LINK_SIZE + skipunits;
8617
8618
/* Remember if this is a lookbehind assertion, and if it is, save its length
8619
and skip over the pattern offset. */
8620
8621
80.5k
lookbehind = *code == OP_ASSERTBACK ||
8622
79.8k
             *code == OP_ASSERTBACK_NOT ||
8623
78.5k
             *code == OP_ASSERTBACK_NA;
8624
8625
80.5k
if (lookbehind)
8626
2.02k
  {
8627
2.02k
  lookbehindlength = META_DATA(pptr[-1]);  /* Taken from the meta item just before pptr */
8628
2.02k
  lookbehindminlength = *pptr;
8629
2.02k
  pptr += SIZEOFFSET;
8630
2.02k
  }
8631
78.5k
else lookbehindlength = lookbehindminlength = 0;
8632
8633
/* If this is a capturing subpattern, add to the chain of open capturing items
8634
so that we can detect them if (*ACCEPT) is encountered. Note that only OP_CBRA
8635
need be tested here; changing this opcode to one of its variants, e.g.
8636
OP_SCBRAPOS, happens later, after the group has been compiled. */
8637
8638
80.5k
if (*code == OP_CBRA)
8639
46.6k
  {
8640
46.6k
  capnumber = GET2(code, 1 + LINK_SIZE);
8641
46.6k
  capitem.number = capnumber;
8642
46.6k
  capitem.next = open_caps;
8643
46.6k
  capitem.assert_depth = cb->assert_depth;
8644
46.6k
  open_caps = &capitem;  /* capitem is a local, so the chain entry lasts only for this call */
8645
46.6k
  }
8646
8647
/* Offset is set zero to mark that this bracket is still open */
8648
8649
80.5k
PUT(code, 1, 0);
8650
80.5k
code += 1 + LINK_SIZE + skipunits;
8651
8652
/* Loop for each alternative branch */
8653
8654
80.5k
for (;;)
8655
101k
  {
8656
101k
  int branch_return;
8657
101k
  uint32_t branchfirstcu = 0, branchreqcu = 0;
8658
101k
  uint32_t branchfirstcuflags = REQ_UNSET, branchreqcuflags = REQ_UNSET;
8659
8660
  /* Insert OP_REVERSE or OP_VREVERSE if this is a lookbehind assertion. There
8661
  is only a single minimum length for the whole assertion. When the minimum
8662
  length is LOOKBEHIND_MAX it means that all branches are of fixed length,
8663
  though not necessarily the same length. In this case, the original OP_REVERSE
8664
  can be used. It can also be used if a branch in a variable length lookbehind
8665
  has the same maximum and minimum. Otherwise, use OP_VREVERSE, which has both
8666
  maximum and minimum values. */
8667
8668
101k
  if (lookbehind && lookbehindlength > 0)
8669
1.02k
    {
8670
1.02k
    if (lookbehindminlength == LOOKBEHIND_MAX ||
8671
104
        lookbehindminlength == lookbehindlength)
8672
920
      {
8673
920
      *code++ = OP_REVERSE;
8674
920
      PUT2INC(code, 0, lookbehindlength);
8675
920
      length += 1 + IMM2_SIZE;
8676
920
      }
8677
104
    else
8678
104
      {
8679
104
      *code++ = OP_VREVERSE;
8680
104
      PUT2INC(code, 0, lookbehindminlength);  /* Minimum is stored first, then the maximum */
8681
104
      PUT2INC(code, 0, lookbehindlength);
8682
104
      length += 1 + 2*IMM2_SIZE;
8683
104
      }
8684
1.02k
    }
8685
8686
  /* Now compile the branch; in the pre-compile phase its length gets added
8687
  into the length. */
8688
8689
101k
  if ((branch_return =
8690
101k
        compile_branch(&options, &xoptions, &code, &pptr, errorcodeptr,
8691
101k
          &branchfirstcu, &branchfirstcuflags, &branchreqcu, &branchreqcuflags,
8692
101k
          &bc, open_caps, cb, (lengthptr == NULL)? NULL : &length)) == 0)
8693
1.57k
    return 0;
8694
8695
  /* If a branch can match an empty string, so can the whole group. */
8696
8697
100k
  if (branch_return < 0) okreturn = -1;
8698
8699
  /* In the real compile phase, there is some post-processing to be done. */
8700
8701
100k
  if (lengthptr == NULL)
8702
49.5k
    {
8703
    /* If this is the first branch, the firstcu and reqcu values for the
8704
    branch become the values for the regex. */
8705
8706
49.5k
    if (*last_branch != OP_ALT)  /* last_branch is moved to an OP_ALT only when a later branch starts */
8707
38.9k
      {
8708
38.9k
      firstcu = branchfirstcu;
8709
38.9k
      firstcuflags = branchfirstcuflags;
8710
38.9k
      reqcu = branchreqcu;
8711
38.9k
      reqcuflags = branchreqcuflags;
8712
38.9k
      }
8713
8714
    /* If this is not the first branch, the first char and reqcu have to
8715
    match the values from all the previous branches, except that if the
8716
    previous value for reqcu didn't have REQ_VARY set, it can still match,
8717
    and we set REQ_VARY for the group from this branch's value. */
8718
8719
10.6k
    else
8720
10.6k
      {
8721
      /* If we previously had a firstcu, but it doesn't match the new branch,
8722
      we have to abandon the firstcu for the regex, but if there was
8723
      previously no reqcu, it takes on the value of the old firstcu. */
8724
8725
10.6k
      if (firstcuflags != branchfirstcuflags || firstcu != branchfirstcu)
8726
5.13k
        {
8727
5.13k
        if (firstcuflags < REQ_NONE)
8728
1.10k
          {
8729
1.10k
          if (reqcuflags >= REQ_NONE)
8730
734
            {
8731
734
            reqcu = firstcu;
8732
734
            reqcuflags = firstcuflags;
8733
734
            }
8734
1.10k
          }
8735
5.13k
        firstcuflags = REQ_NONE;
8736
5.13k
        }
8737
8738
      /* If we (now or from before) have no firstcu, a firstcu from the
8739
      branch becomes a reqcu if there isn't a branch reqcu. */
8740
8741
10.6k
      if (firstcuflags >= REQ_NONE && branchfirstcuflags < REQ_NONE &&
8742
2.67k
          branchreqcuflags >= REQ_NONE)
8743
1.25k
        {
8744
1.25k
        branchreqcu = branchfirstcu;
8745
1.25k
        branchreqcuflags = branchfirstcuflags;
8746
1.25k
        }
8747
8748
      /* Now ensure that the reqcus match */
8749
8750
10.6k
      if (((reqcuflags & ~REQ_VARY) != (branchreqcuflags & ~REQ_VARY)) ||
8751
4.36k
          reqcu != branchreqcu)
8752
6.90k
        reqcuflags = REQ_NONE;
8753
3.72k
      else
8754
3.72k
        {
8755
3.72k
        reqcu = branchreqcu;
8756
3.72k
        reqcuflags |= branchreqcuflags; /* To "or" REQ_VARY if present */
8757
3.72k
        }
8758
10.6k
      }
8759
49.5k
    }
8760
8761
  /* Handle reaching the end of the expression, either ')' or end of pattern.
8762
  In the real compile phase, go back through the alternative branches and
8763
  reverse the chain of offsets, with the field in the BRA item now becoming an
8764
  offset to the first alternative. If there are no alternatives, it points to
8765
  the end of the group. The length in the terminating ket is always the length
8766
  of the whole bracketed item. Return leaving the pointer at the terminating
8767
  char. */
8768
8769
100k
  if (META_CODE(*pptr) != META_ALT)
8770
78.9k
    {
8771
78.9k
    if (lengthptr == NULL)
8772
38.9k
      {
8773
38.9k
      uint32_t branch_length = (uint32_t)(code - last_branch);
8774
38.9k
      do
8775
49.5k
        {
8776
49.5k
        uint32_t prev_length = GET(last_branch, 1);  /* Backward offset saved while the bracket was open */
8777
49.5k
        PUT(last_branch, 1, branch_length);
8778
49.5k
        branch_length = prev_length;
8779
49.5k
        last_branch -= branch_length;
8780
49.5k
        }
8781
49.5k
      while (branch_length > 0);  /* The opening bracket's offset was set to zero above, which ends the walk */
8782
38.9k
      }
8783
8784
    /* Fill in the ket */
8785
8786
78.9k
    *code = OP_KET;
8787
78.9k
    PUT(code, 1, (uint32_t)(code - start_bracket));
8788
78.9k
    code += 1 + LINK_SIZE;
8789
8790
    /* Set values to pass back */
8791
8792
78.9k
    *codeptr = code;
8793
78.9k
    *pptrptr = pptr;
8794
78.9k
    *firstcuptr = firstcu;
8795
78.9k
    *firstcuflagsptr = firstcuflags;
8796
78.9k
    *reqcuptr = reqcu;
8797
78.9k
    *reqcuflagsptr = reqcuflags;
8798
78.9k
    if (lengthptr != NULL)
8799
40.0k
      {
8800
40.0k
      if (OFLOW_MAX - *lengthptr < length)  /* Adding length would take the total past OFLOW_MAX */
8801
0
        {
8802
0
        *errorcodeptr = ERR20;
8803
0
        return 0;
8804
0
        }
8805
40.0k
      *lengthptr += length;
8806
40.0k
      }
8807
78.9k
    return okreturn;
8808
78.9k
    }
8809
8810
  /* Another branch follows. In the pre-compile phase, we can move the code
8811
  pointer back to where it was for the start of the first branch. (That is,
8812
  pretend that each branch is the only one.)
8813
8814
  In the real compile phase, insert an ALT node. Its length field points back
8815
  to the previous branch while the bracket remains open. At the end the chain
8816
  is reversed. It's done like this so that the start of the bracket has a
8817
  zero offset until it is closed, making it possible to detect recursion. */
8818
8819
21.2k
  if (lengthptr != NULL)
8820
10.6k
    {
8821
10.6k
    code = *codeptr + 1 + LINK_SIZE + skipunits;  /* Same position as just after the bracket header */
8822
10.6k
    length += 1 + LINK_SIZE;
8823
10.6k
    }
8824
10.6k
  else
8825
10.6k
    {
8826
10.6k
    *code = OP_ALT;
8827
10.6k
    PUT(code, 1, (int)(code - last_branch));
8828
10.6k
    bc.current_branch = last_branch = code;
8829
10.6k
    code += 1 + LINK_SIZE;
8830
10.6k
    }
8831
8832
  /* Set the maximum lookbehind length for the next branch (if not in a
8833
  lookbehind the value will be zero) and then advance past the vertical bar. */
8834
8835
21.2k
  lookbehindlength = META_DATA(*pptr);
8836
21.2k
  pptr++;
8837
21.2k
  }
8838
8839
/* LCOV_EXCL_START */
8840
0
PCRE2_DEBUG_UNREACHABLE(); /* Control should never reach here */
8841
0
return 0;                  /* Avoid compiler warnings */
8842
/* LCOV_EXCL_STOP */
8843
80.5k
}
8844
8845
8846
8847
/*************************************************
8848
*          Check for anchored pattern            *
8849
*************************************************/
8850
8851
/* Try to find out if this is an anchored regular expression. Consider each
8852
alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
8853
all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
8854
it's anchored. However, if this is a multiline pattern, then only OP_SOD will
8855
be found, because ^ generates OP_CIRCM in that mode.
8856
8857
We can also consider a regex to be anchored if OP_SOM starts all its branches.
8858
This is the code for \G, which means "match at start of match position, taking
8859
into account the match offset".
8860
8861
A branch is also implicitly anchored if it starts with .* and DOTALL is set,
8862
because that will try the rest of the pattern at all possible matching points,
8863
so there is no point trying again.... er ....
8864
8865
.... except when the .* appears inside capturing parentheses, and there is a
8866
subsequent back reference to those parentheses. We haven't enough information
8867
to catch that case precisely.
8868
8869
At first, the best we could do was to detect when .* was in capturing brackets
8870
and the highest back reference was greater than or equal to that level.
8871
However, by keeping a bitmap of the first 31 back references, we can catch some
8872
of the more common cases more precisely.
8873
8874
... A second exception is when the .* appears inside an atomic group, because
8875
this prevents the number of characters it matches from being adjusted.
8876
8877
Arguments:
8878
  code           points to start of the compiled pattern
8879
  bracket_map    a bitmap of which brackets we are inside while testing; this
8880
                   handles up to substring 31; after that we just have to take
8881
                   the less precise approach
8882
  cb             points to the compile data block
8883
  atomcount      atomic group level
8884
  inassert       TRUE if in an assertion
8885
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8886
8887
Returns:     TRUE or FALSE
8888
*/
8889
8890
/* Return TRUE only if every alternative of the group at 'code' begins with an
item that anchors it, recursing into any nested group that starts a branch. */
static BOOL
8891
is_anchored(PCRE2_SPTR code, uint32_t bracket_map, compile_block *cb,
8892
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8893
13.6k
{
8894
15.4k
do {
8895
15.4k
   PCRE2_SPTR scode = first_significant_code(
8896
15.4k
     code + PRIV(OP_lengths)[*code], FALSE);  /* Start looking just past the item at code */
8897
15.4k
   int op = *scode;
8898
8899
   /* Non-capturing brackets */
8900
8901
15.4k
   if (op == OP_BRA  || op == OP_BRAPOS ||
8902
14.3k
       op == OP_SBRA || op == OP_SBRAPOS)
8903
1.23k
     {
8904
1.23k
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8905
814
       return FALSE;
8906
1.23k
     }
8907
8908
   /* Capturing brackets */
8909
8910
14.1k
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
8911
10.5k
            op == OP_SCBRA || op == OP_SCBRAPOS)
8912
3.91k
     {
8913
3.91k
     int n = GET2(scode, 1+LINK_SIZE);
8914
3.91k
     uint32_t new_map = bracket_map | ((n < 32)? (1u << n) : 1);  /* Groups numbered 32 or more all share bit 0 */
8915
3.91k
     if (!is_anchored(scode, new_map, cb, atomcount, inassert, dotstar_anchor)) return FALSE;
8916
3.91k
     }
8917
8918
   /* Positive forward assertion */
8919
8920
10.2k
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
8921
815
     {
8922
815
     if (!is_anchored(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor)) return FALSE;  /* inassert is forced TRUE */
8923
815
     }
8924
8925
   /* Condition. If there is no second branch, it can't be anchored. */
8926
8927
9.46k
   else if (op == OP_COND || op == OP_SCOND)
8928
587
     {
8929
587
     if (scode[GET(scode,1)] != OP_ALT) return FALSE;
8930
376
     if (!is_anchored(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
8931
342
       return FALSE;
8932
376
     }
8933
8934
   /* Atomic groups */
8935
8936
8.87k
   else if (op == OP_ONCE)
8937
1.11k
     {
8938
1.11k
     if (!is_anchored(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))  /* A non-zero atomcount rejects any .* inside */
8939
867
       return FALSE;
8940
1.11k
     }
8941
8942
   /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
8943
   it isn't in brackets that are or may be referenced or inside an atomic
8944
   group or an assertion. Also the pattern must not contain *PRUNE or *SKIP,
8945
   because these break the feature. Consider, for example, /(?s).*?(*PRUNE)b/
8946
   with the subject "aab", which matches "b", i.e. not at the start of a line.
8947
   There is also an option that disables auto-anchoring. */
8948
8949
7.75k
   else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
8950
7.38k
             op == OP_TYPEPOSSTAR))
8951
795
     {
8952
795
     if (scode[1] != OP_ALLANY || (bracket_map & cb->backref_map) != 0 ||
8953
620
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
8954
177
       return FALSE;
8955
795
     }
8956
8957
   /* Check for explicit anchoring */
8958
8959
6.96k
   else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
8960
8961
3.06k
   code += GET(code, 1);  /* Follow the link to whatever comes after this branch */
8962
3.06k
   }
8963
13.6k
while (*code == OP_ALT);   /* Loop for each alternative */
8964
1.33k
return TRUE;
8965
13.6k
}
8966
8967
8968
8969
/*************************************************
8970
*         Check for starting with ^ or .*        *
8971
*************************************************/
8972
8973
/* This is called to find out if every branch starts with ^ or .* so that
8974
"first char" processing can be done to speed things up in multiline
8975
matching and for non-DOTALL patterns that start with .* (which must start at
8976
the beginning or after \n). As in the case of is_anchored() (see above), we
8977
have to take account of back references to capturing brackets that contain .*
8978
because in that case we can't make the assumption. Also, the appearance of .*
8979
inside atomic brackets or in an assertion, or in a pattern that contains *PRUNE
8980
or *SKIP does not count, because once again the assumption no longer holds.
8981
8982
Arguments:
8983
  code           points to start of the compiled pattern or a group
8984
  bracket_map    a bitmap of which brackets we are inside while testing; this
8985
                   handles up to substring 31; after that we just have to take
8986
                   the less precise approach
8987
  cb             points to the compile data
8988
  atomcount      atomic group level
8989
  inassert       TRUE if in an assertion
8990
  dotstar_anchor TRUE if automatic anchoring optimization is enabled
8991
8992
Returns:         TRUE or FALSE
8993
*/
8994
8995
/* Return TRUE only if every alternative of the group at 'code' begins with a
circumflex or a qualifying .* item, recursing into any nested group that starts
a branch. */
static BOOL
8996
is_startline(PCRE2_SPTR code, unsigned int bracket_map, compile_block *cb,
8997
  int atomcount, BOOL inassert, BOOL dotstar_anchor)
8998
10.1k
{
8999
11.9k
do {
9000
11.9k
   PCRE2_SPTR scode = first_significant_code(
9001
11.9k
     code + PRIV(OP_lengths)[*code], FALSE);  /* Start looking just past the item at code */
9002
11.9k
   int op = *scode;
9003
9004
   /* If we are at the start of a conditional assertion group, *both* the
9005
   conditional assertion *and* what follows the condition must satisfy the test
9006
   for start of line. Other kinds of condition fail. Note that there may be an
9007
   auto-callout at the start of a condition. */
9008
9009
11.9k
   if (op == OP_COND)
9010
483
     {
9011
483
     scode += 1 + LINK_SIZE;
9012
9013
483
     if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
9014
332
       else if (*scode == OP_CALLOUT_STR) scode += GET(scode, 1 + 2*LINK_SIZE);  /* Length is read from the item itself */
9015
9016
483
     switch (*scode)
9017
483
       {
9018
88
       case OP_CREF:
9019
89
       case OP_DNCREF:
9020
160
       case OP_RREF:
9021
161
       case OP_DNRREF:
9022
174
       case OP_FAIL:
9023
174
       case OP_FALSE:
9024
174
       case OP_TRUE:
9025
174
       return FALSE;
9026
9027
309
       default:     /* Assertion */
9028
309
       if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))
9029
230
         return FALSE;
9030
195
       do scode += GET(scode, 1); while (*scode == OP_ALT);  /* Skip over all of the assertion's alternatives */
9031
79
       scode += 1 + LINK_SIZE;  /* Then step over the item that closes it (presumably OP_KET) */
9032
79
       break;
9033
483
       }
9034
79
     scode = first_significant_code(scode, FALSE);
9035
79
     op = *scode;  /* Continue the tests below with what follows the condition */
9036
79
     }
9037
9038
   /* Non-capturing brackets */
9039
9040
11.5k
   if (op == OP_BRA  || op == OP_BRAPOS ||
9041
10.7k
       op == OP_SBRA || op == OP_SBRAPOS)
9042
902
     {
9043
902
     if (!is_startline(scode, bracket_map, cb, atomcount, inassert, dotstar_anchor))
9044
659
       return FALSE;
9045
902
     }
9046
9047
   /* Capturing brackets */
9048
9049
10.6k
   else if (op == OP_CBRA  || op == OP_CBRAPOS ||
9050
7.85k
            op == OP_SCBRA || op == OP_SCBRAPOS)
9051
3.03k
     {
9052
3.03k
     int n = GET2(scode, 1+LINK_SIZE);
9053
3.03k
     unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1);  /* Groups numbered 32 or more all share bit 0 */
9054
3.03k
     if (!is_startline(scode, new_map, cb, atomcount, inassert, dotstar_anchor))
9055
2.76k
       return FALSE;
9056
3.03k
     }
9057
9058
   /* Positive forward assertions */
9059
9060
7.57k
   else if (op == OP_ASSERT || op == OP_ASSERT_NA)
9061
395
     {
9062
395
     if (!is_startline(scode, bracket_map, cb, atomcount, TRUE, dotstar_anchor))  /* inassert is forced TRUE */
9063
193
       return FALSE;
9064
395
     }
9065
9066
   /* Atomic brackets */
9067
9068
7.18k
   else if (op == OP_ONCE)
9069
650
     {
9070
650
     if (!is_startline(scode, bracket_map, cb, atomcount + 1, inassert, dotstar_anchor))
9071
453
       return FALSE;
9072
650
     }
9073
9074
   /* .* means "start at start or after \n" if it isn't in atomic brackets or
9075
   brackets that may be referenced or an assertion, and as long as the pattern
9076
   does not contain *PRUNE or *SKIP, because these break the feature. Consider,
9077
   for example, /.*?a(*PRUNE)b/ with the subject "aab", which matches "ab",
9078
   i.e. not at the start of a line. There is also an option that disables this
9079
   optimization. */
9080
9081
6.53k
   else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
9082
889
     {
9083
889
     if (scode[1] != OP_ANY || (bracket_map & cb->backref_map) != 0 ||
9084
803
         atomcount > 0 || cb->had_pruneorskip || inassert || !dotstar_anchor)
9085
107
       return FALSE;
9086
889
     }
9087
9088
   /* Check for explicit circumflex; anything else gives a FALSE result. Note
9089
   that atomic brackets (OP_ONCE) are handled above: atomcount is incremented
9090
   for them, so a .* tested inside an atomic group gives a FALSE result. */
9091
9092
5.64k
   else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
9093
9094
   /* Move on to the next alternative */
9095
9096
2.85k
   code += GET(code, 1);
9097
2.85k
   }
9098
10.1k
while (*code == OP_ALT);  /* Loop for each alternative */
9099
1.06k
return TRUE;
9100
10.1k
}
9101
9102
9103
9104
/*************************************************
9105
*   Scan compiled regex for recursion reference  *
9106
*************************************************/
9107
9108
/* This function scans through a compiled pattern until it finds an instance of
9109
OP_RECURSE.
9110
9111
Arguments:
9112
  code        points to start of expression
9113
  utf         TRUE in UTF mode
9114
9115
Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
9116
*/
9117
9118
static PCRE2_UCHAR *
9119
find_recurse(PCRE2_UCHAR *code, BOOL utf)
9120
99.6k
{
9121
99.6k
for (;;)
9122
441k
  {
9123
441k
  PCRE2_UCHAR c = *code;
9124
441k
  if (c == OP_END) return NULL;
9125
441k
  if (c == OP_RECURSE) return code;
9126
9127
  /* XCLASS is used for classes that cannot be represented just by a bit map.
9128
  This includes negated single high-valued characters. ECLASS is used for
9129
  classes that use set operations internally. CALLOUT_STR is used for
9130
  callouts with string arguments. In each case the length in the table is
9131
  zero; the actual length is stored in the compiled code. */
9132
9133
341k
  if (c == OP_XCLASS || c == OP_ECLASS) code += GET(code, 1);
9134
341k
  else if (c == OP_CALLOUT_STR) code += GET(code, 1 + 2*LINK_SIZE);
9135
9136
  /* Otherwise, we can get the item's length from the table, except that for
9137
  repeated character types, we have to test for \p and \P, which have an extra
9138
  two code units of parameters, and for MARK/PRUNE/SKIP/THEN with an argument,
9139
  we must add in its length. */
9140
9141
341k
  else
9142
341k
    {
9143
341k
    switch(c)
9144
341k
      {
9145
41
      case OP_TYPESTAR:
9146
62
      case OP_TYPEMINSTAR:
9147
64
      case OP_TYPEPLUS:
9148
65
      case OP_TYPEMINPLUS:
9149
68
      case OP_TYPEQUERY:
9150
69
      case OP_TYPEMINQUERY:
9151
70
      case OP_TYPEPOSSTAR:
9152
71
      case OP_TYPEPOSPLUS:
9153
72
      case OP_TYPEPOSQUERY:
9154
72
      if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
9155
72
      break;
9156
9157
1
      case OP_TYPEPOSUPTO:
9158
2
      case OP_TYPEUPTO:
9159
2
      case OP_TYPEMINUPTO:
9160
7
      case OP_TYPEEXACT:
9161
7
      if (code[1 + IMM2_SIZE] == OP_PROP || code[1 + IMM2_SIZE] == OP_NOTPROP)
9162
0
        code += 2;
9163
7
      break;
9164
9165
1
      case OP_MARK:
9166
1
      case OP_COMMIT_ARG:
9167
4
      case OP_PRUNE_ARG:
9168
4
      case OP_SKIP_ARG:
9169
5
      case OP_THEN_ARG:
9170
5
      code += code[1];
9171
5
      break;
9172
341k
      }
9173
9174
    /* Add in the fixed length from the table */
9175
9176
341k
    code += PRIV(OP_lengths)[c];
9177
9178
    /* In UTF-8 and UTF-16 modes, opcodes that are followed by a character may
9179
    be followed by a multi-unit character. The length in the table is a
9180
    minimum, so we have to arrange to skip the extra units. */
9181
9182
341k
#ifdef MAYBE_UTF_MULTI
9183
341k
    if (utf) switch(c)
9184
0
      {
9185
0
      case OP_CHAR:
9186
0
      case OP_CHARI:
9187
0
      case OP_NOT:
9188
0
      case OP_NOTI:
9189
0
      case OP_EXACT:
9190
0
      case OP_EXACTI:
9191
0
      case OP_NOTEXACT:
9192
0
      case OP_NOTEXACTI:
9193
0
      case OP_UPTO:
9194
0
      case OP_UPTOI:
9195
0
      case OP_NOTUPTO:
9196
0
      case OP_NOTUPTOI:
9197
0
      case OP_MINUPTO:
9198
0
      case OP_MINUPTOI:
9199
0
      case OP_NOTMINUPTO:
9200
0
      case OP_NOTMINUPTOI:
9201
0
      case OP_POSUPTO:
9202
0
      case OP_POSUPTOI:
9203
0
      case OP_NOTPOSUPTO:
9204
0
      case OP_NOTPOSUPTOI:
9205
0
      case OP_STAR:
9206
0
      case OP_STARI:
9207
0
      case OP_NOTSTAR:
9208
0
      case OP_NOTSTARI:
9209
0
      case OP_MINSTAR:
9210
0
      case OP_MINSTARI:
9211
0
      case OP_NOTMINSTAR:
9212
0
      case OP_NOTMINSTARI:
9213
0
      case OP_POSSTAR:
9214
0
      case OP_POSSTARI:
9215
0
      case OP_NOTPOSSTAR:
9216
0
      case OP_NOTPOSSTARI:
9217
0
      case OP_PLUS:
9218
0
      case OP_PLUSI:
9219
0
      case OP_NOTPLUS:
9220
0
      case OP_NOTPLUSI:
9221
0
      case OP_MINPLUS:
9222
0
      case OP_MINPLUSI:
9223
0
      case OP_NOTMINPLUS:
9224
0
      case OP_NOTMINPLUSI:
9225
0
      case OP_POSPLUS:
9226
0
      case OP_POSPLUSI:
9227
0
      case OP_NOTPOSPLUS:
9228
0
      case OP_NOTPOSPLUSI:
9229
0
      case OP_QUERY:
9230
0
      case OP_QUERYI:
9231
0
      case OP_NOTQUERY:
9232
0
      case OP_NOTQUERYI:
9233
0
      case OP_MINQUERY:
9234
0
      case OP_MINQUERYI:
9235
0
      case OP_NOTMINQUERY:
9236
0
      case OP_NOTMINQUERYI:
9237
0
      case OP_POSQUERY:
9238
0
      case OP_POSQUERYI:
9239
0
      case OP_NOTPOSQUERY:
9240
0
      case OP_NOTPOSQUERYI:
9241
0
      if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
9242
0
      break;
9243
0
      }
9244
#else
9245
    (void)(utf);  /* Keep compiler happy by referencing function argument */
9246
#endif  /* MAYBE_UTF_MULTI */
9247
341k
    }
9248
341k
  }
9249
99.6k
}
9250
9251
9252
9253
/*************************************************
9254
*    Check for asserted fixed first code unit    *
9255
*************************************************/
9256
9257
/* During compilation, the "first code unit" settings from forward assertions
9258
are discarded, because they can cause conflicts with actual literals that
9259
follow. However, if we end up without a first code unit setting for an
9260
unanchored pattern, it is worth scanning the regex to see if there is an
9261
initial asserted first code unit. If all branches start with the same asserted
9262
code unit, or with a non-conditional bracket all of whose alternatives start
9263
with the same asserted code unit (recurse ad lib), then we return that code
9264
unit, with the flags set to zero or REQ_CASELESS; otherwise return zero with
9265
REQ_NONE in the flags.
9266
9267
Arguments:
9268
  code       points to start of compiled pattern
9269
  flags      points to the first code unit flags
9270
  inassert   non-zero if in an assertion
9271
9272
Returns:     the fixed first code unit, or 0 with REQ_NONE in flags
9273
*/
9274
9275
static uint32_t
9276
find_firstassertedcu(PCRE2_SPTR code, uint32_t *flags, uint32_t inassert)
9277
10.3k
{
9278
10.3k
uint32_t c = 0;
9279
10.3k
uint32_t cflags = REQ_NONE;
9280
9281
10.3k
*flags = REQ_NONE;
9282
12.1k
do {
9283
12.1k
   uint32_t d;
9284
12.1k
   uint32_t dflags;
9285
12.1k
   int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
9286
9.26k
             *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
9287
12.1k
   PCRE2_SPTR scode = first_significant_code(code + 1+LINK_SIZE + xl, TRUE);
9288
12.1k
   PCRE2_UCHAR op = *scode;
9289
9290
12.1k
   switch(op)
9291
12.1k
     {
9292
4.69k
     default:
9293
4.69k
     return 0;
9294
9295
815
     case OP_BRA:
9296
861
     case OP_BRAPOS:
9297
3.54k
     case OP_CBRA:
9298
3.77k
     case OP_SCBRA:
9299
3.84k
     case OP_CBRAPOS:
9300
3.89k
     case OP_SCBRAPOS:
9301
4.80k
     case OP_ASSERT:
9302
4.80k
     case OP_ASSERT_NA:
9303
5.35k
     case OP_ONCE:
9304
5.35k
     case OP_SCRIPT_RUN:
9305
5.35k
     d = find_firstassertedcu(scode, &dflags, inassert +
9306
5.35k
       ((op == OP_ASSERT || op == OP_ASSERT_NA)?1:0));
9307
5.35k
     if (dflags >= REQ_NONE) return 0;
9308
881
     if (cflags >= REQ_NONE) { c = d; cflags = dflags; }
9309
353
       else if (c != d || cflags != dflags) return 0;
9310
868
     break;
9311
9312
868
     case OP_EXACT:
9313
200
     scode += IMM2_SIZE;
9314
200
     PCRE2_FALLTHROUGH /* Fall through */
9315
9316
1.39k
     case OP_CHAR:
9317
1.60k
     case OP_PLUS:
9318
1.80k
     case OP_MINPLUS:
9319
2.01k
     case OP_POSPLUS:
9320
2.01k
     if (inassert == 0) return 0;
9321
1.89k
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = 0; }
9322
1.43k
       else if (c != scode[1]) return 0;
9323
1.88k
     break;
9324
9325
1.88k
     case OP_EXACTI:
9326
13
     scode += IMM2_SIZE;
9327
13
     PCRE2_FALLTHROUGH /* Fall through */
9328
9329
65
     case OP_CHARI:
9330
79
     case OP_PLUSI:
9331
93
     case OP_MINPLUSI:
9332
107
     case OP_POSPLUSI:
9333
107
     if (inassert == 0) return 0;
9334
9335
     /* If the character is more than one code unit long, we cannot set its
9336
     first code unit when matching caselessly. Later scanning may pick up
9337
     multiple code units. */
9338
9339
71
#ifdef SUPPORT_UNICODE
9340
71
#if PCRE2_CODE_UNIT_WIDTH == 8
9341
71
     if (scode[1] >= 0x80) return 0;
9342
#elif PCRE2_CODE_UNIT_WIDTH == 16
9343
     if (scode[1] >= 0xd800 && scode[1] <= 0xdfff) return 0;
9344
#endif
9345
29
#endif
9346
9347
29
     if (cflags >= REQ_NONE) { c = scode[1]; cflags = REQ_CASELESS; }
9348
1
       else if (c != scode[1]) return 0;
9349
28
     break;
9350
12.1k
     }
9351
9352
2.77k
   code += GET(code, 1);
9353
2.77k
   }
9354
10.3k
while (*code == OP_ALT);
9355
9356
938
*flags = cflags;
9357
938
return c;
9358
10.3k
}
9359
9360
9361
9362
/*************************************************
9363
*             Skip in parsed pattern             *
9364
*************************************************/
9365
9366
/* This function is called to skip parts of the parsed pattern when finding the
9367
length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find
9368
the end of the branch, it is called to skip over an internal lookaround or
9369
(DEFINE) group, and it is also called to skip to the end of a class, during
9370
which it will never encounter nested groups (but there's no need to have
9371
special code for that).
9372
9373
When called to find the end of a branch or group, pptr must point to the first
9374
meta code inside the branch, not the branch-starting code. In other cases it
9375
can point to the item that causes the function to be called.
9376
9377
Arguments:
9378
  pptr       current pointer to skip from
9379
  skiptype   PSKIP_CLASS when skipping to end of class
9380
             PSKIP_ALT when META_ALT ends the skip
9381
             PSKIP_KET when only META_KET ends the skip
9382
9383
Returns:     new value of pptr
9384
             NULL if META_END is reached - should never occur
9385
               or for an unknown meta value - likewise
9386
*/
9387
9388
static uint32_t *
9389
parsed_skip(uint32_t *pptr, uint32_t skiptype)
9390
465
{
9391
465
uint32_t nestlevel = 0;
9392
9393
5.96k
for (;; pptr++)
9394
6.43k
  {
9395
6.43k
  uint32_t meta = META_CODE(*pptr);
9396
9397
6.43k
  switch(meta)
9398
6.43k
    {
9399
4.51k
    default:  /* Just skip over most items */
9400
4.51k
    if (meta < META_END) continue;  /* Literal */
9401
624
    break;
9402
9403
    /* The parsed regex is malformed; we have reached the end and did
9404
    not find the end of the construct which we are skipping over. */
9405
9406
    /* LCOV_EXCL_START */
9407
624
    case META_END:
9408
0
    PCRE2_DEBUG_UNREACHABLE();
9409
0
    return NULL;
9410
    /* LCOV_EXCL_STOP */
9411
9412
    /* The data for these items is variable in length. */
9413
9414
27
    case META_BACKREF:  /* Offset is present only if group >= 10 */
9415
27
    if (META_DATA(*pptr) >= 10) pptr += SIZEOFFSET;
9416
27
    break;
9417
9418
10
    case META_ESCAPE:
9419
10
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
9420
0
      pptr += 1;     /* Skip prop data */
9421
10
    break;
9422
9423
0
    case META_MARK:     /* Add the length of the name. */
9424
0
    case META_COMMIT_ARG:
9425
0
    case META_PRUNE_ARG:
9426
0
    case META_SKIP_ARG:
9427
0
    case META_THEN_ARG:
9428
0
    pptr += pptr[1];
9429
0
    break;
9430
9431
    /* These are the "active" items in this loop. */
9432
9433
146
    case META_CLASS_END:
9434
146
    if (skiptype == PSKIP_CLASS) return pptr;
9435
2
    break;
9436
9437
2
    case META_ATOMIC:
9438
675
    case META_CAPTURE:
9439
675
    case META_COND_ASSERT:
9440
675
    case META_COND_DEFINE:
9441
675
    case META_COND_NAME:
9442
680
    case META_COND_NUMBER:
9443
680
    case META_COND_RNAME:
9444
680
    case META_COND_RNUMBER:
9445
680
    case META_COND_VERSION:
9446
680
    case META_SCS:
9447
680
    case META_LOOKAHEAD:
9448
680
    case META_LOOKAHEADNOT:
9449
680
    case META_LOOKAHEAD_NA:
9450
682
    case META_LOOKBEHIND:
9451
687
    case META_LOOKBEHINDNOT:
9452
687
    case META_LOOKBEHIND_NA:
9453
687
    case META_NOCAPTURE:
9454
687
    case META_SCRIPT_RUN:
9455
687
    nestlevel++;
9456
687
    break;
9457
9458
41
    case META_ALT:
9459
41
    if (nestlevel == 0 && skiptype == PSKIP_ALT) return pptr;
9460
41
    break;
9461
9462
1.00k
    case META_KET:
9463
1.00k
    if (nestlevel == 0) return pptr;
9464
687
    nestlevel--;
9465
687
    break;
9466
6.43k
    }
9467
9468
  /* The extra data item length for each meta is in a table. */
9469
9470
2.07k
  meta = (meta >> 16) & 0x7fff;
9471
2.07k
  if (meta >= sizeof(meta_extra_lengths)) return NULL;
9472
2.07k
  pptr += meta_extra_lengths[meta];
9473
2.07k
  }
9474
9475
/* LCOV_EXCL_START */
9476
465
PCRE2_UNREACHABLE(); /* Control never reaches here */
9477
/* LCOV_EXCL_STOP */
9478
465
}
9479
9480
9481
9482
/*************************************************
9483
*       Find length of a parsed group            *
9484
*************************************************/
9485
9486
/* This is called for nested groups within a branch of a lookbehind whose
9487
length is being computed. On entry, the pointer must be at the first element
9488
after the group initializing code. On exit it points to OP_KET. Caching is used
9489
to improve processing speed when the same capturing group occurs many times.
9490
9491
Arguments:
9492
  pptrptr     pointer to pointer in the parsed pattern
9493
  minptr      where to return the minimum length
9494
  isinline    FALSE if a reference or recursion; TRUE for inline group
9495
  errcodeptr  pointer to the errorcode
9496
  lcptr       pointer to the loop counter
9497
  group       number of captured group or -1 for a non-capturing group
9498
  recurses    chain of recurse_check to catch mutual recursion
9499
  cb          pointer to the compile data
9500
9501
Returns:      the maximum group length or a negative number
9502
*/
9503
9504
static int
9505
get_grouplength(uint32_t **pptrptr, int *minptr, BOOL isinline, int *errcodeptr,
9506
  int *lcptr, int group, parsed_recurse_check *recurses, compile_block *cb)
9507
1.75k
{
9508
1.75k
uint32_t *gi = cb->groupinfo + 2 * group;
9509
1.75k
int branchlength, branchminlength;
9510
1.75k
int grouplength = -1;
9511
1.75k
int groupminlength = INT_MAX;
9512
9513
/* The cache can be used only if there is no possibility of there being two
9514
groups with the same number. We do not need to set the end pointer for a group
9515
that is being processed as a back reference or recursion, but we must do so for
9516
an inline group. */
9517
9518
1.75k
if (group > 0 && (cb->external_flags & PCRE2_DUPCAPUSED) == 0)
9519
1.61k
  {
9520
1.61k
  uint32_t groupinfo = gi[0];
9521
1.61k
  if ((groupinfo & GI_NOT_FIXED_LENGTH) != 0) return -1;
9522
1.61k
  if ((groupinfo & GI_SET_FIXED_LENGTH) != 0)
9523
94
    {
9524
94
    if (isinline) *pptrptr = parsed_skip(*pptrptr, PSKIP_KET);
9525
94
    *minptr = gi[1];
9526
94
    return groupinfo & GI_FIXED_LENGTH_MASK;
9527
94
    }
9528
1.61k
  }
9529
9530
/* Scan the group. In this case we find the end pointer of necessity. */
9531
9532
1.66k
for(;;)
9533
1.72k
  {
9534
1.72k
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
9535
1.72k
    recurses, cb);
9536
1.72k
  if (branchlength < 0) goto ISNOTFIXED;
9537
1.40k
  if (branchlength > grouplength) grouplength = branchlength;
9538
1.40k
  if (branchminlength < groupminlength) groupminlength = branchminlength;
9539
1.40k
  if (**pptrptr == META_KET) break;
9540
65
  *pptrptr += 1;   /* Skip META_ALT */
9541
65
  }
9542
9543
1.34k
if (group > 0)
9544
1.23k
  {
9545
1.23k
  gi[0] |= (uint32_t)(GI_SET_FIXED_LENGTH | grouplength);
9546
1.23k
  gi[1] = groupminlength;
9547
1.23k
  }
9548
9549
1.34k
*minptr = groupminlength;
9550
1.34k
return grouplength;
9551
9552
319
ISNOTFIXED:
9553
319
if (group > 0) gi[0] |= GI_NOT_FIXED_LENGTH;
9554
319
return -1;
9555
1.66k
}
9556
9557
9558
9559
/*************************************************
9560
*        Find length of a parsed branch          *
9561
*************************************************/
9562
9563
/* Return fixed maximum and minimum lengths for a branch in a lookbehind,
9564
giving an error if the length is not limited. On entry, *pptrptr points to the
9565
first element inside the branch. On exit it is set to point to the ALT or KET.
9566
9567
Arguments:
9568
  pptrptr     pointer to pointer in the parsed pattern
9569
  minptr      where to return the minimum length
9570
  errcodeptr  pointer to error code
9571
  lcptr       pointer to loop counter
9572
  recurses    chain of recurse_check to catch mutual recursion
9573
  cb          pointer to compile block
9574
9575
Returns:      the maximum length, or a negative value on error
9576
*/
9577
9578
static int
9579
get_branchlength(uint32_t **pptrptr, int *minptr, int *errcodeptr, int *lcptr,
9580
  parsed_recurse_check *recurses, compile_block *cb)
9581
3.43k
{
9582
3.43k
int branchlength = 0;
9583
3.43k
int branchminlength = 0;
9584
3.43k
int grouplength, groupminlength;
9585
3.43k
uint32_t lastitemlength = 0;
9586
3.43k
uint32_t lastitemminlength = 0;
9587
3.43k
uint32_t *pptr = *pptrptr;
9588
3.43k
PCRE2_SIZE offset;
9589
3.43k
parsed_recurse_check this_recurse;
9590
9591
/* A large and/or complex regex can take too long to process. This can happen
9592
more often when (?| groups are present in the pattern because their length
9593
cannot be cached. */
9594
9595
3.43k
if ((*lcptr)++ > 2000)
9596
0
  {
9597
0
  *errcodeptr = ERR35;  /* Lookbehind is too complicated */
9598
0
  return -1;
9599
0
  }
9600
9601
/* Scan the branch, accumulating the length. */
9602
9603
9.37k
for (;; pptr++)
9604
12.8k
  {
9605
12.8k
  parsed_recurse_check *r;
9606
12.8k
  uint32_t *gptr, *gptrend;
9607
12.8k
  uint32_t escape;
9608
12.8k
  uint32_t min, max;
9609
12.8k
  uint32_t group = 0;
9610
12.8k
  uint32_t itemlength = 0;
9611
12.8k
  uint32_t itemminlength = 0;
9612
9613
12.8k
  if (*pptr < META_END)
9614
6.06k
    {
9615
6.06k
    itemlength = itemminlength = 1;
9616
6.06k
    }
9617
9618
6.74k
  else switch (META_CODE(*pptr))
9619
6.74k
    {
9620
2.35k
    case META_KET:
9621
2.75k
    case META_ALT:
9622
2.75k
    goto EXIT;
9623
9624
    /* (*ACCEPT) and (*FAIL) terminate the branch, but we must skip to the
9625
    actual termination. */
9626
9627
18
    case META_ACCEPT:
9628
22
    case META_FAIL:
9629
22
    pptr = parsed_skip(pptr, PSKIP_ALT);
9630
22
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9631
22
    goto EXIT;
9632
9633
22
    case META_MARK:
9634
8
    case META_COMMIT_ARG:
9635
16
    case META_PRUNE_ARG:
9636
24
    case META_SKIP_ARG:
9637
32
    case META_THEN_ARG:
9638
32
    pptr += pptr[1] + 1;
9639
32
    break;
9640
9641
47
    case META_CIRCUMFLEX:
9642
55
    case META_COMMIT:
9643
98
    case META_DOLLAR:
9644
106
    case META_PRUNE:
9645
114
    case META_SKIP:
9646
122
    case META_THEN:
9647
122
    break;
9648
9649
44
    case META_OPTIONS:
9650
44
    pptr += 2;
9651
44
    break;
9652
9653
0
    case META_BIGVALUE:
9654
0
    itemlength = itemminlength = 1;
9655
0
    pptr += 1;
9656
0
    break;
9657
9658
83
    case META_CLASS:
9659
144
    case META_CLASS_NOT:
9660
144
    itemlength = itemminlength = 1;
9661
144
    pptr = parsed_skip(pptr, PSKIP_CLASS);
9662
144
    if (pptr == NULL) goto PARSED_SKIP_FAILED;
9663
144
    break;
9664
9665
144
    case META_CLASS_EMPTY_NOT:
9666
111
    case META_DOT:
9667
111
    itemlength = itemminlength = 1;
9668
111
    break;
9669
9670
10
    case META_CALLOUT_NUMBER:
9671
10
    pptr += 3;
9672
10
    break;
9673
9674
0
    case META_CALLOUT_STRING:
9675
0
    pptr += 3 + SIZEOFFSET;
9676
0
    break;
9677
9678
    /* Only some escapes consume a character. Of those, \R can match one or two
9679
    characters, but \X is never allowed because it matches an unknown number of
9680
    characters. \C is allowed only in 32-bit and non-UTF 8/16-bit modes. */
9681
9682
373
    case META_ESCAPE:
9683
373
    escape = META_DATA(*pptr);
9684
373
    if (escape == ESC_X) return -1;
9685
373
    if (escape == ESC_R)
9686
2
      {
9687
2
      itemminlength = 1;
9688
2
      itemlength = 2;
9689
2
      }
9690
371
    else if (escape > ESC_b && escape < ESC_Z)
9691
271
      {
9692
271
#if PCRE2_CODE_UNIT_WIDTH != 32
9693
271
      if ((cb->external_options & PCRE2_UTF) != 0 && escape == ESC_C)
9694
0
        {
9695
0
        *errcodeptr = ERR36;
9696
0
        return -1;
9697
0
        }
9698
271
#endif
9699
271
      itemlength = itemminlength = 1;
9700
271
      if (escape == ESC_p || escape == ESC_P) pptr++;  /* Skip prop data */
9701
271
      }
9702
373
    break;
9703
9704
    /* Lookaheads do not contribute to the length of this branch, but they may
9705
    contain lookbehinds within them whose lengths need to be set. */
9706
9707
373
    case META_LOOKAHEAD:
9708
116
    case META_LOOKAHEADNOT:
9709
116
    case META_LOOKAHEAD_NA:
9710
116
    case META_SCS:
9711
116
    *errcodeptr = check_lookbehinds(pptr + 1, &pptr, recurses, cb, lcptr);
9712
116
    if (*errcodeptr != 0) return -1;
9713
9714
    /* Ignore any qualifiers that follow a lookahead assertion. */
9715
9716
116
    switch (pptr[1])
9717
116
      {
9718
1
      case META_ASTERISK:
9719
1
      case META_ASTERISK_PLUS:
9720
1
      case META_ASTERISK_QUERY:
9721
1
      case META_PLUS:
9722
1
      case META_PLUS_PLUS:
9723
1
      case META_PLUS_QUERY:
9724
3
      case META_QUERY:
9725
3
      case META_QUERY_PLUS:
9726
3
      case META_QUERY_QUERY:
9727
3
      pptr++;
9728
3
      break;
9729
9730
1
      case META_MINMAX:
9731
1
      case META_MINMAX_PLUS:
9732
1
      case META_MINMAX_QUERY:
9733
1
      pptr += 3;
9734
1
      break;
9735
9736
112
      default:
9737
112
      break;
9738
116
      }
9739
116
    break;
9740
9741
    /* A nested lookbehind does not contribute any length to this lookbehind,
9742
    but must itself be checked and have its lengths set. Note that
9743
    set_lookbehind_lengths() updates pptr, leaving it pointing to the final ket
9744
    of the group, so no need to update it here. */
9745
9746
116
    case META_LOOKBEHIND:
9747
160
    case META_LOOKBEHINDNOT:
9748
160
    case META_LOOKBEHIND_NA:
9749
160
    if (!set_lookbehind_lengths(&pptr, errcodeptr, lcptr, recurses, cb))
9750
6
      return -1;
9751
154
    break;
9752
9753
    /* Back references and recursions are handled by very similar code. At this
9754
    stage, the names generated in the parsing pass are available, but the main
9755
    name table has not yet been created. So for the named varieties, scan the
9756
    list of names in order to get the number of the first one in the pattern,
9757
    and whether or not this name is duplicated. */
9758
9759
154
    case META_BACKREF_BYNAME:
9760
3
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0)
9761
0
      goto ISNOTFIXED;
9762
3
    PCRE2_FALLTHROUGH /* Fall through */
9763
3
9764
4
    case META_RECURSE_BYNAME:
9765
4
      {
9766
4
      PCRE2_SPTR name;
9767
4
      BOOL is_dupname = FALSE;
9768
4
      named_group *ng;
9769
4
      uint32_t meta_code = META_CODE(*pptr);
9770
4
      uint32_t length = *(++pptr);
9771
9772
4
      GETPLUSOFFSET(offset, pptr);
9773
4
      name = cb->start_pattern + offset;
9774
4
      ng = PRIV(compile_find_named_group)(name, length, cb);
9775
9776
4
      if (ng == NULL)
9777
0
        {
9778
0
        *errcodeptr = ERR15;  /* Non-existent subpattern */
9779
0
        cb->erroroffset = offset;
9780
0
        return -1;
9781
0
        }
9782
9783
4
      group = ng->number;
9784
4
      is_dupname = (ng->hash_dup & NAMED_GROUP_IS_DUPNAME) != 0;
9785
9786
      /* A numerical back reference can be fixed length if duplicate capturing
9787
      groups are not being used. A non-duplicate named back reference can also
9788
      be handled. */
9789
9790
4
      if (meta_code == META_RECURSE_BYNAME ||
9791
3
          (!is_dupname && (cb->external_flags & PCRE2_DUPCAPUSED) == 0))
9792
1
        goto RECURSE_OR_BACKREF_LENGTH;  /* Handle as a numbered version. */
9793
4
      }
9794
3
    goto ISNOTFIXED;                     /* Duplicate name or number */
9795
9796
    /* The offset values for back references < 10 are in a separate vector
9797
    because otherwise they would use more than two parsed pattern elements on
9798
    64-bit systems. */
9799
9800
9
    case META_BACKREF:
9801
9
    if ((cb->external_options & PCRE2_MATCH_UNSET_BACKREF) != 0 ||
9802
9
        (cb->external_flags & PCRE2_DUPCAPUSED) != 0)
9803
0
      goto ISNOTFIXED;
9804
9
    group = META_DATA(*pptr);
9805
9
    if (group < 10)
9806
7
      {
9807
7
      offset = cb->small_ref_offset[group];
9808
7
      goto RECURSE_OR_BACKREF_LENGTH;
9809
7
      }
9810
9811
2
    PCRE2_FALLTHROUGH /* Fall through */
9812
2
    /* For groups >= 10 - picking up group twice does no harm. */
9813
2
9814
2
    /* A true recursion implies not fixed length, but a subroutine call may
9815
2
    be OK. Back reference "recursions" are also failed. */
9816
2
9817
297
    case META_RECURSE:
9818
297
    group = META_DATA(*pptr);
9819
297
    GETPLUSOFFSET(offset, pptr);
9820
9821
305
    RECURSE_OR_BACKREF_LENGTH:
9822
305
    if (group > cb->bracount)
9823
16
      {
9824
16
      cb->erroroffset = offset;
9825
16
      *errcodeptr = ERR15;  /* Non-existent subpattern */
9826
16
      return -1;
9827
16
      }
9828
289
    if (group == 0) goto ISNOTFIXED;  /* Local recursion */
9829
20.3k
    for (gptr = cb->parsed_pattern; *gptr != META_END; gptr++)
9830
20.3k
      {
9831
20.3k
      if (META_CODE(*gptr) == META_BIGVALUE) gptr++;
9832
20.3k
        else if (*gptr == (META_CAPTURE | group)) break;
9833
20.3k
      }
9834
9835
    /* We must start the search for the end of the group at the first meta code
9836
    inside the group. Otherwise it will be treated as an enclosed group. */
9837
9838
256
    gptrend = parsed_skip(gptr + 1, PSKIP_KET);
9839
256
    if (gptrend == NULL) goto PARSED_SKIP_FAILED;
9840
256
    if (pptr > gptr && pptr < gptrend) goto ISNOTFIXED;  /* Local recursion */
9841
1.09k
    for (r = recurses; r != NULL; r = r->prev) if (r->groupptr == gptr) break;
9842
247
    if (r != NULL) goto ISNOTFIXED;   /* Mutual recursion */
9843
243
    this_recurse.prev = recurses;
9844
243
    this_recurse.groupptr = gptr;
9845
9846
    /* We do not need to know the position of the end of the group, that is,
9847
    gptr is not used after the call to get_grouplength(). Setting the second
9848
    argument FALSE stops it scanning for the end when the length can be found
9849
    in the cache. */
9850
9851
243
    gptr++;
9852
243
    grouplength = get_grouplength(&gptr, &groupminlength, FALSE, errcodeptr,
9853
243
      lcptr, group, &this_recurse, cb);
9854
243
    if (grouplength < 0)
9855
121
      {
9856
121
      if (*errcodeptr == 0) goto ISNOTFIXED;
9857
121
      return -1;  /* Error already set */
9858
121
      }
9859
122
    itemlength = grouplength;
9860
122
    itemminlength = groupminlength;
9861
122
    break;
9862
9863
    /* A (DEFINE) group is never obeyed inline and so it does not contribute to
9864
    the length of this branch. Skip from the following item to the next
9865
    unpaired ket. */
9866
9867
8
    case META_COND_DEFINE:
9868
8
    pptr = parsed_skip(pptr + 1, PSKIP_KET);
9869
8
    break;
9870
9871
    /* Check other nested groups - advance past the initial data for each type
9872
    and then seek a fixed length with get_grouplength(). */
9873
9874
8
    case META_COND_NAME:
9875
19
    case META_COND_NUMBER:
9876
27
    case META_COND_RNAME:
9877
36
    case META_COND_RNUMBER:
9878
36
    pptr += 2 + SIZEOFFSET;
9879
36
    goto CHECK_GROUP;
9880
9881
24
    case META_COND_ASSERT:
9882
24
    pptr += 1;
9883
24
    goto CHECK_GROUP;
9884
9885
0
    case META_COND_VERSION:
9886
0
    pptr += 4;
9887
0
    goto CHECK_GROUP;
9888
9889
1.40k
    case META_CAPTURE:
9890
1.40k
    group = META_DATA(*pptr);
9891
1.40k
    PCRE2_FALLTHROUGH /* Fall through */
9892
9893
1.40k
    case META_ATOMIC:
9894
1.45k
    case META_NOCAPTURE:
9895
1.45k
    case META_SCRIPT_RUN:
9896
1.45k
    pptr++;
9897
1.51k
    CHECK_GROUP:
9898
1.51k
    grouplength = get_grouplength(&pptr, &groupminlength, TRUE, errcodeptr,
9899
1.51k
      lcptr, group, recurses, cb);
9900
1.51k
    if (grouplength < 0) return -1;
9901
1.31k
    itemlength = grouplength;
9902
1.31k
    itemminlength = groupminlength;
9903
1.31k
    break;
9904
9905
9
    case META_QUERY:
9906
15
    case META_QUERY_PLUS:
9907
22
    case META_QUERY_QUERY:
9908
22
    min = 0;
9909
22
    max = 1;
9910
22
    goto REPETITION;
9911
9912
    /* Exact repetition is OK; variable repetition is not. A repetition of zero
9913
    must subtract the length that has already been added. */
9914
9915
907
    case META_MINMAX:
9916
927
    case META_MINMAX_PLUS:
9917
946
    case META_MINMAX_QUERY:
9918
946
    min = pptr[1];
9919
946
    max = pptr[2];
9920
946
    pptr += 2;
9921
9922
968
    REPETITION:
9923
968
    if (max != REPEAT_UNLIMITED)
9924
955
      {
9925
955
      if (lastitemlength != 0 &&  /* Should not occur, but just in case */
9926
709
          max != 0 &&
9927
708
          (INT_MAX - branchlength)/lastitemlength < max - 1)
9928
0
        {
9929
0
        *errcodeptr = ERR87;  /* Integer overflow; lookbehind too big */
9930
0
        return -1;
9931
0
        }
9932
955
      if (min == 0) branchminlength -= lastitemminlength;
9933
921
        else itemminlength = (min - 1) * lastitemminlength;
9934
955
      if (max == 0) branchlength -= lastitemlength;
9935
951
        else itemlength = (max - 1) * lastitemlength;
9936
955
      break;
9937
955
      }
9938
13
    PCRE2_FALLTHROUGH /* Fall through */
9939
13
9940
13
    /* Any other item means this branch does not have a fixed length. */
9941
13
9942
71
    default:
9943
120
    ISNOTFIXED:
9944
120
    *errcodeptr = ERR25;   /* Not fixed length */
9945
120
    return -1;
9946
6.74k
    }
9947
9948
  /* Add the item length to the branchlength, checking for integer overflow and
9949
  for the branch length exceeding the overall limit. Later, if there is at
9950
  least one variable-length branch in the group, there is a test for the
9951
  (smaller) variable-length branch length limit. */
9952
9953
9.57k
  if (INT_MAX - branchlength < (int)itemlength ||
9954
9.57k
      (branchlength += itemlength) > LOOKBEHIND_MAX)
9955
193
    {
9956
193
    *errcodeptr = ERR87;
9957
193
    return -1;
9958
193
    }
9959
9960
9.37k
  branchminlength += itemminlength;
9961
9962
  /* Save this item length for use if the next item is a quantifier. */
9963
9964
9.37k
  lastitemlength = itemlength;
9965
9.37k
  lastitemminlength = itemminlength;
9966
9.37k
  }
9967
9968
2.77k
EXIT:
9969
2.77k
*pptrptr = pptr;
9970
2.77k
*minptr = branchminlength;
9971
2.77k
return branchlength;
9972
9973
/* LCOV_EXCL_START */
9974
0
PARSED_SKIP_FAILED:
9975
0
PCRE2_DEBUG_UNREACHABLE();
9976
0
*errcodeptr = ERR90;  /* Unhandled META code - internal error */
9977
0
return -1;
9978
/* LCOV_EXCL_STOP */
9979
3.43k
}
9980
9981
9982
9983
/*************************************************
9984
*        Set lengths in a lookbehind             *
9985
*************************************************/
9986
9987
/* This function is called for each lookbehind, to set the lengths in its
9988
branches. An error occurs if any branch does not have a limited maximum length
9989
that is less than the limit (65535). On exit, the pointer must be left on the
9990
final ket.
9991
9992
The function also maintains the max_lookbehind value. Any lookbehind branch
9993
that contains a nested lookbehind may actually look further back than the
9994
length of the branch. The additional amount is passed back from
9995
get_branchlength() as an "extra" value.
9996
9997
Arguments:
9998
  pptrptr     pointer to pointer in the parsed pattern
9999
  errcodeptr  pointer to error code
10000
  lcptr       pointer to loop counter
10001
  recurses    chain of recurse_check to catch mutual recursion
10002
  cb          pointer to compile block
10003
10004
Returns:      TRUE if all is well
10005
              FALSE otherwise, with error code and offset set
10006
*/
10007
10008
static BOOL
10009
set_lookbehind_lengths(uint32_t **pptrptr, int *errcodeptr, int *lcptr,
10010
  parsed_recurse_check *recurses, compile_block *cb)
10011
1.36k
{
10012
1.36k
PCRE2_SIZE offset;
10013
1.36k
uint32_t *bptr = *pptrptr;
10014
1.36k
uint32_t *gbptr = bptr;
10015
1.36k
int maxlength = 0;
10016
1.36k
int minlength = INT_MAX;
10017
1.36k
BOOL variable = FALSE;
10018
10019
1.36k
READPLUSOFFSET(offset, bptr);  /* Offset for error messages */
10020
1.36k
*pptrptr += SIZEOFFSET;
10021
10022
/* Each branch can have a different maximum length, but we can keep only a
10023
single minimum for the whole group, because there's nowhere to save individual
10024
values in the META_ALT item. */
10025
10026
1.36k
do
10027
1.70k
  {
10028
1.70k
  int branchlength, branchminlength;
10029
10030
1.70k
  *pptrptr += 1;
10031
1.70k
  branchlength = get_branchlength(pptrptr, &branchminlength, errcodeptr, lcptr,
10032
1.70k
    recurses, cb);
10033
10034
1.70k
  if (branchlength < 0)
10035
335
    {
10036
    /* The errorcode and offset may already be set from a nested lookbehind. */
10037
335
    if (*errcodeptr == 0) *errcodeptr = ERR25;
10038
335
    if (cb->erroroffset == PCRE2_UNSET) cb->erroroffset = offset;
10039
335
    return FALSE;
10040
335
    }
10041
10042
1.37k
  if (branchlength != branchminlength) variable = TRUE;
10043
1.37k
  if (branchminlength < minlength) minlength = branchminlength;
10044
1.37k
  if (branchlength > maxlength) maxlength = branchlength;
10045
1.37k
  if (branchlength > cb->max_lookbehind) cb->max_lookbehind = branchlength;
10046
1.37k
  *bptr |= branchlength;  /* branchlength never more than 65535 */
10047
1.37k
  bptr = *pptrptr;
10048
1.37k
  }
10049
1.37k
while (META_CODE(*bptr) == META_ALT);
10050
10051
/* If any branch is of variable length, the whole lookbehind is of variable
10052
length. If the maximum length of any branch exceeds the maximum for variable
10053
lookbehinds, give an error. Otherwise, the minimum length is set in the word
10054
that follows the original group META value. For a fixed-length lookbehind, this
10055
is set to LOOKBEHIND_MAX, to indicate that each branch is of a fixed (but
10056
possibly different) length. */
10057
10058
1.03k
if (variable)
10059
65
  {
10060
65
  gbptr[1] = minlength;
10061
65
  if ((PCRE2_SIZE)maxlength > cb->max_varlookbehind)
10062
13
    {
10063
13
    *errcodeptr = ERR100;
10064
13
    cb->erroroffset = offset;
10065
13
    return FALSE;
10066
13
    }
10067
65
  }
10068
968
else gbptr[1] = LOOKBEHIND_MAX;
10069
10070
1.02k
return TRUE;
10071
1.03k
}
10072
10073
10074
10075
/*************************************************
10076
*         Check parsed pattern lookbehinds       *
10077
*************************************************/
10078
10079
/* This function is called at the end of parsing a pattern if any lookbehinds
10080
were encountered. It scans the parsed pattern for them, calling
10081
set_lookbehind_lengths() for each one. At the start, the errorcode is zero and
10082
the error offset is marked unset. The enables the functions above not to
10083
override settings from deeper nestings.
10084
10085
This function is called recursively from get_branchlength() for lookaheads in
10086
order to process any lookbehinds that they may contain. It stops when it hits a
10087
non-nested closing parenthesis in this case, returning a pointer to it.
10088
10089
Arguments
10090
  pptr      points to where to start (start of pattern or start of lookahead)
10091
  retptr    if not NULL, return the ket pointer here
10092
  recurses  chain of recurse_check to catch mutual recursion
10093
  cb        points to the compile block
10094
  lcptr     points to loop counter
10095
10096
Returns:    0 on success, or an errorcode (cb->erroroffset will be set)
10097
*/
10098
10099
static int
10100
check_lookbehinds(uint32_t *pptr, uint32_t **retptr,
10101
  parsed_recurse_check *recurses, compile_block *cb, int *lcptr)
10102
1.16k
{
10103
1.16k
int errorcode = 0;
10104
1.16k
int nestlevel = 0;
10105
10106
1.16k
cb->erroroffset = PCRE2_UNSET;
10107
10108
20.3k
for (; *pptr != META_END; pptr++)
10109
19.6k
  {
10110
19.6k
  if (*pptr < META_END) continue;  /* Literal */
10111
10112
14.0k
  switch (META_CODE(*pptr))
10113
14.0k
    {
10114
    /* The following erroroffset is a bogus but safe value. This branch should
10115
    be avoided by providing a proper implementation for all supported cases
10116
    below. */
10117
10118
    /* LCOV_EXCL_START */
10119
0
    default:
10120
0
    PCRE2_DEBUG_UNREACHABLE();
10121
0
    cb->erroroffset = 0;
10122
0
    return ERR70;  /* Unrecognized meta code */
10123
    /* LCOV_EXCL_STOP */
10124
10125
151
    case META_ESCAPE:
10126
151
    if (*pptr - META_ESCAPE == ESC_P || *pptr - META_ESCAPE == ESC_p)
10127
0
      pptr += 1;    /* Skip prop data */
10128
151
    break;
10129
10130
6.00k
    case META_KET:
10131
6.00k
    if (--nestlevel < 0)
10132
116
      {
10133
116
      if (retptr != NULL) *retptr = pptr;
10134
116
      return 0;
10135
116
      }
10136
5.88k
    break;
10137
10138
5.88k
    case META_ATOMIC:
10139
5.79k
    case META_CAPTURE:
10140
5.83k
    case META_COND_ASSERT:
10141
5.83k
    case META_SCS:
10142
5.88k
    case META_LOOKAHEAD:
10143
5.88k
    case META_LOOKAHEADNOT:
10144
5.88k
    case META_LOOKAHEAD_NA:
10145
5.91k
    case META_NOCAPTURE:
10146
5.91k
    case META_SCRIPT_RUN:
10147
5.91k
    nestlevel++;
10148
5.91k
    break;
10149
10150
8
    case META_ACCEPT:
10151
217
    case META_ALT:
10152
245
    case META_ASTERISK:
10153
245
    case META_ASTERISK_PLUS:
10154
245
    case META_ASTERISK_QUERY:
10155
247
    case META_BACKREF:
10156
255
    case META_CIRCUMFLEX:
10157
279
    case META_CLASS:
10158
279
    case META_CLASS_EMPTY:
10159
279
    case META_CLASS_EMPTY_NOT:
10160
304
    case META_CLASS_END:
10161
305
    case META_CLASS_NOT:
10162
305
    case META_COMMIT:
10163
308
    case META_DOLLAR:
10164
454
    case META_DOT:
10165
454
    case META_FAIL:
10166
462
    case META_PLUS:
10167
465
    case META_PLUS_PLUS:
10168
465
    case META_PLUS_QUERY:
10169
465
    case META_PRUNE:
10170
492
    case META_QUERY:
10171
492
    case META_QUERY_PLUS:
10172
492
    case META_QUERY_QUERY:
10173
492
    case META_RANGE_ESCAPED:
10174
492
    case META_RANGE_LITERAL:
10175
492
    case META_SKIP:
10176
492
    case META_THEN:
10177
492
    break;
10178
10179
0
    case META_OFFSET:
10180
28
    case META_RECURSE:
10181
28
    pptr += SIZEOFFSET;
10182
28
    break;
10183
10184
7
    case META_BACKREF_BYNAME:
10185
7
    case META_RECURSE_BYNAME:
10186
7
    pptr += 1 + SIZEOFFSET;
10187
7
    break;
10188
10189
0
    case META_COND_DEFINE:
10190
0
    pptr += SIZEOFFSET;
10191
0
    nestlevel++;
10192
0
    break;
10193
10194
0
    case META_COND_NAME:
10195
0
    case META_COND_NUMBER:
10196
0
    case META_COND_RNAME:
10197
0
    case META_COND_RNUMBER:
10198
0
    pptr += 1 + SIZEOFFSET;
10199
0
    nestlevel++;
10200
0
    break;
10201
10202
0
    case META_COND_VERSION:
10203
0
    pptr += 3;
10204
0
    nestlevel++;
10205
0
    break;
10206
10207
0
    case META_CALLOUT_STRING:
10208
0
    pptr += 3 + SIZEOFFSET;
10209
0
    break;
10210
10211
0
    case META_BIGVALUE:
10212
0
    case META_POSIX:
10213
0
    case META_POSIX_NEG:
10214
0
    case META_CAPTURE_NAME:
10215
0
    case META_CAPTURE_NUMBER:
10216
0
    pptr += 1;
10217
0
    break;
10218
10219
42
    case META_MINMAX:
10220
42
    case META_MINMAX_QUERY:
10221
44
    case META_MINMAX_PLUS:
10222
99
    case META_OPTIONS:
10223
99
    pptr += 2;
10224
99
    break;
10225
10226
124
    case META_CALLOUT_NUMBER:
10227
124
    pptr += 3;
10228
124
    break;
10229
10230
0
    case META_MARK:
10231
0
    case META_COMMIT_ARG:
10232
0
    case META_PRUNE_ARG:
10233
0
    case META_SKIP_ARG:
10234
0
    case META_THEN_ARG:
10235
0
    pptr += 1 + pptr[1];
10236
0
    break;
10237
10238
    /* Note that set_lookbehind_lengths() updates pptr, leaving it pointing to
10239
    the final ket of the group, so no need to update it here. */
10240
10241
350
    case META_LOOKBEHIND:
10242
1.20k
    case META_LOOKBEHINDNOT:
10243
1.20k
    case META_LOOKBEHIND_NA:
10244
1.20k
    if (!set_lookbehind_lengths(&pptr, &errorcode, lcptr, recurses, cb))
10245
342
      return errorcode;
10246
866
    break;
10247
14.0k
    }
10248
14.0k
  }
10249
10250
707
return 0;
10251
1.16k
}
10252
10253
10254
10255
/*************************************************
10256
*     External function to compile a pattern     *
10257
*************************************************/
10258
10259
/* This function reads a regular expression in the form of a string and returns
10260
a pointer to a block of store holding a compiled version of the expression.
10261
10262
Arguments:
10263
  pattern       the regular expression
10264
  patlen        the length of the pattern, or PCRE2_ZERO_TERMINATED
10265
  options       option bits
10266
  errorptr      pointer to errorcode
10267
  erroroffset   pointer to error offset
10268
  ccontext      points to a compile context or is NULL
10269
10270
Returns:        pointer to compiled data block, or NULL on error,
10271
                with errorcode and erroroffset set
10272
*/
10273
10274
PCRE2_EXP_DEFN pcre2_code * PCRE2_CALL_CONVENTION
10275
pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,
10276
   int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)
10277
11.1k
{
10278
11.1k
BOOL utf;                             /* Set TRUE for UTF mode */
10279
11.1k
BOOL ucp;                             /* Set TRUE for UCP mode */
10280
11.1k
BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */
10281
11.1k
BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */
10282
11.1k
pcre2_real_code *re = NULL;           /* What we will return */
10283
11.1k
compile_block cb;                     /* "Static" compile-time data */
10284
11.1k
const uint8_t *tables;                /* Char tables base pointer */
10285
10286
11.1k
PCRE2_UCHAR null_str[1] = { 0xcd };   /* Dummy for handling null inputs */
10287
11.1k
PCRE2_UCHAR *code;                    /* Current pointer in compiled code */
10288
11.1k
PCRE2_UCHAR *codestart;               /* Start of compiled code */
10289
11.1k
PCRE2_SPTR ptr;                       /* Current pointer in pattern */
10290
11.1k
uint32_t *pptr;                       /* Current pointer in parsed pattern */
10291
10292
11.1k
PCRE2_SIZE length = 1;                /* Allow for final END opcode */
10293
11.1k
PCRE2_SIZE usedlength;                /* Actual length used */
10294
11.1k
PCRE2_SIZE re_blocksize;              /* Size of memory block */
10295
11.1k
PCRE2_SIZE parsed_size_needed;        /* Needed for parsed pattern */
10296
10297
11.1k
uint32_t firstcuflags, reqcuflags;    /* Type of first/req code unit */
10298
11.1k
uint32_t firstcu, reqcu;              /* Value of first/req code unit */
10299
11.1k
uint32_t setflags = 0;                /* NL and BSR set flags */
10300
11.1k
uint32_t xoptions;                    /* Flags from context, modified */
10301
10302
11.1k
uint32_t skipatstart;                 /* When checking (*UTF) etc */
10303
11.1k
uint32_t limit_heap  = UINT32_MAX;
10304
11.1k
uint32_t limit_match = UINT32_MAX;    /* Unset match limits */
10305
11.1k
uint32_t limit_depth = UINT32_MAX;
10306
10307
11.1k
int newline = 0;                      /* Unset; can be set by the pattern */
10308
11.1k
int bsr = 0;                          /* Unset; can be set by the pattern */
10309
11.1k
int errorcode = 0;                    /* Initialize to avoid compiler warn */
10310
11.1k
int regexrc;                          /* Return from compile */
10311
10312
11.1k
uint32_t i;                           /* Local loop counter */
10313
10314
/* Enable all optimizations by default. */
10315
11.1k
uint32_t optim_flags = ccontext != NULL ? ccontext->optimization_flags :
10316
11.1k
                                          PCRE2_OPTIMIZATION_ALL;
10317
10318
/* Comments at the head of this file explain about these variables. */
10319
10320
11.1k
uint32_t stack_groupinfo[GROUPINFO_DEFAULT_SIZE];
10321
11.1k
uint32_t stack_parsed_pattern[PARSED_PATTERN_DEFAULT_SIZE];
10322
11.1k
named_group named_groups[NAMED_GROUP_LIST_SIZE];
10323
10324
/* The workspace is used in different ways in the different compiling phases.
10325
It needs to be 16-bit aligned for the preliminary parsing scan. */
10326
10327
11.1k
uint32_t c16workspace[C16_WORK_SIZE];
10328
11.1k
PCRE2_UCHAR *cworkspace = (PCRE2_UCHAR *)c16workspace;
10329
10330
10331
/* -------------- Check arguments and set up the pattern ----------------- */
10332
10333
/* There must be error code and offset pointers. */
10334
10335
11.1k
if (errorptr == NULL)
10336
0
  {
10337
0
  if (erroroffset != NULL) *erroroffset = 0;
10338
0
  return NULL;
10339
0
  }
10340
11.1k
if (erroroffset == NULL)
10341
0
  {
10342
0
  if (errorptr != NULL) *errorptr = ERR120;
10343
0
  return NULL;
10344
0
  }
10345
11.1k
*errorptr = ERR0;
10346
11.1k
*erroroffset = 0;
10347
10348
/* There must be a pattern, but NULL is allowed with zero length. */
10349
10350
11.1k
if (pattern == NULL)
10351
0
  {
10352
0
  if (patlen == 0)
10353
0
    pattern = null_str;
10354
0
  else
10355
0
    {
10356
0
    *errorptr = ERR16;
10357
0
    return NULL;
10358
0
    }
10359
0
  }
10360
10361
/* A NULL compile context means "use a default context" */
10362
10363
11.1k
if (ccontext == NULL)
10364
11.1k
  ccontext = (pcre2_compile_context *)(&PRIV(default_compile_context));
10365
10366
/* PCRE2_MATCH_INVALID_UTF implies UTF */
10367
10368
11.1k
if ((options & PCRE2_MATCH_INVALID_UTF) != 0) options |= PCRE2_UTF;
10369
10370
/* Check that all undefined public option bits are zero. */
10371
10372
11.1k
if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0 ||
10373
11.1k
    (ccontext->extra_options & ~PUBLIC_COMPILE_EXTRA_OPTIONS) != 0)
10374
0
  {
10375
0
  *errorptr = ERR17;
10376
0
  return NULL;
10377
0
  }
10378
10379
11.1k
if ((options & PCRE2_LITERAL) != 0 &&
10380
0
    ((options & ~PUBLIC_LITERAL_COMPILE_OPTIONS) != 0 ||
10381
0
     (ccontext->extra_options & ~PUBLIC_LITERAL_COMPILE_EXTRA_OPTIONS) != 0))
10382
0
  {
10383
0
  *errorptr = ERR92;
10384
0
  return NULL;
10385
0
  }
10386
10387
/* A zero-terminated pattern is indicated by the special length value
10388
PCRE2_ZERO_TERMINATED. Check for an overlong pattern. */
10389
10390
11.1k
if ((zero_terminated = (patlen == PCRE2_ZERO_TERMINATED)))
10391
11.1k
  patlen = PRIV(strlen)(pattern);
10392
11.1k
(void)zero_terminated; /* Silence compiler; only used if Valgrind enabled */
10393
10394
11.1k
if (patlen > ccontext->max_pattern_length)
10395
0
  {
10396
0
  *errorptr = ERR88;
10397
0
  return NULL;
10398
0
  }
10399
10400
/* Optimization flags in 'options' can override those in the compile context.
10401
This is because some options to disable optimizations were added before the
10402
optimization flags word existed, and we need to continue supporting them
10403
for backwards compatibility. */
10404
10405
11.1k
if ((options & PCRE2_NO_AUTO_POSSESS) != 0)
10406
0
  optim_flags &= ~PCRE2_OPTIM_AUTO_POSSESS;
10407
11.1k
if ((options & PCRE2_NO_DOTSTAR_ANCHOR) != 0)
10408
0
  optim_flags &= ~PCRE2_OPTIM_DOTSTAR_ANCHOR;
10409
11.1k
if ((options & PCRE2_NO_START_OPTIMIZE) != 0)
10410
0
  optim_flags &= ~PCRE2_OPTIM_START_OPTIMIZE;
10411
10412
/* From here on, all returns from this function should end up going via the
10413
EXIT label. */
10414
10415
10416
/* ------------ Initialize the "static" compile data -------------- */
10417
10418
11.1k
tables = (ccontext->tables != NULL)? ccontext->tables : PRIV(default_tables);
10419
10420
11.1k
cb.lcc = tables + lcc_offset;          /* Individual */
10421
11.1k
cb.fcc = tables + fcc_offset;          /*   character */
10422
11.1k
cb.cbits = tables + cbits_offset;      /*      tables */
10423
11.1k
cb.ctypes = tables + ctypes_offset;
10424
10425
11.1k
cb.assert_depth = 0;
10426
11.1k
cb.bracount = 0;
10427
11.1k
cb.cx = ccontext;
10428
11.1k
cb.dupnames = FALSE;
10429
11.1k
cb.end_pattern = pattern + patlen;
10430
11.1k
cb.erroroffset = 0;
10431
11.1k
cb.external_flags = 0;
10432
11.1k
cb.external_options = options;
10433
11.1k
cb.groupinfo = stack_groupinfo;
10434
11.1k
cb.had_recurse = FALSE;
10435
11.1k
cb.lastcapture = 0;
10436
11.1k
cb.max_lookbehind = 0;                               /* Max encountered */
10437
11.1k
cb.max_varlookbehind = ccontext->max_varlookbehind;  /* Limit */
10438
11.1k
cb.name_entry_size = 0;
10439
11.1k
cb.name_table = NULL;
10440
11.1k
cb.named_groups = named_groups;
10441
11.1k
cb.named_group_list_size = NAMED_GROUP_LIST_SIZE;
10442
11.1k
cb.names_found = 0;
10443
11.1k
cb.parens_depth = 0;
10444
11.1k
cb.parsed_pattern = stack_parsed_pattern;
10445
11.1k
cb.req_varyopt = 0;
10446
11.1k
cb.start_code = cworkspace;
10447
11.1k
cb.start_pattern = pattern;
10448
11.1k
cb.start_workspace = cworkspace;
10449
11.1k
cb.workspace_size = COMPILE_WORK_SIZE;
10450
11.1k
cb.first_data = NULL;
10451
11.1k
cb.last_data = NULL;
10452
11.1k
#ifdef SUPPORT_WIDE_CHARS
10453
11.1k
cb.char_lists_size = 0;
10454
11.1k
#endif
10455
10456
/* Maximum back reference and backref bitmap. The bitmap records up to 31 back
10457
references to help in deciding whether (.*) can be treated as anchored or not.
10458
*/
10459
10460
11.1k
cb.top_backref = 0;
10461
11.1k
cb.backref_map = 0;
10462
10463
/* Escape sequences \1 to \9 are always back references, but as they are only
10464
two characters long, only two elements can be used in the parsed_pattern
10465
vector. The first contains the reference, and we'd like to use the second to
10466
record the offset in the pattern, so that forward references to non-existent
10467
groups can be diagnosed later with an offset. However, on 64-bit systems,
10468
PCRE2_SIZE won't fit. Instead, we have a vector of offsets for the first
10469
occurrence of \1 to \9, indexed by the second parsed_pattern value. All other
10470
references have enough space for the offset to be put into the parsed pattern.
10471
*/
10472
10473
123k
for (i = 0; i < 10; i++) cb.small_ref_offset[i] = PCRE2_UNSET;
10474
10475
10476
/* --------------- Start looking at the pattern --------------- */
10477
10478
/* Unless PCRE2_LITERAL is set, check for global one-time option settings at
10479
the start of the pattern, and remember the offset to the actual regex. With
10480
valgrind support, make the terminator of a zero-terminated pattern
10481
inaccessible. This catches bugs that would otherwise only show up for
10482
non-zero-terminated patterns. */
10483
10484
#ifdef SUPPORT_VALGRIND
10485
if (zero_terminated) VALGRIND_MAKE_MEM_NOACCESS(pattern + patlen, CU2BYTES(1));
10486
#endif
10487
10488
11.1k
xoptions = ccontext->extra_options;
10489
11.1k
ptr = pattern;
10490
11.1k
skipatstart = 0;
10491
10492
11.1k
if ((options & PCRE2_LITERAL) == 0)
10493
11.1k
  {
10494
12.5k
  while (patlen - skipatstart >= 2 &&
10495
12.4k
         ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
10496
8.71k
         ptr[skipatstart+1] == CHAR_ASTERISK)
10497
2.32k
    {
10498
41.5k
    for (i = 0; i < sizeof(pso_list)/sizeof(pso); i++)
10499
40.6k
      {
10500
40.6k
      const pso *p = pso_list + i;
10501
10502
40.6k
      if (patlen - skipatstart - 2 >= p->length &&
10503
27.7k
          PRIV(strncmp_c8)(ptr + skipatstart + 2, p->name, p->length) == 0)
10504
1.46k
        {
10505
1.46k
        uint32_t c, pp;
10506
10507
1.46k
        skipatstart += p->length + 2;
10508
1.46k
        switch(p->type)
10509
1.46k
          {
10510
297
          case PSO_OPT:
10511
297
          cb.external_options |= p->value;
10512
297
          break;
10513
10514
0
          case PSO_XOPT:
10515
0
          xoptions |= p->value;
10516
0
          break;
10517
10518
0
          case PSO_FLG:
10519
0
          setflags |= p->value;
10520
0
          break;
10521
10522
895
          case PSO_NL:
10523
895
          newline = p->value;
10524
895
          setflags |= PCRE2_NL_SET;
10525
895
          break;
10526
10527
100
          case PSO_BSR:
10528
100
          bsr = p->value;
10529
100
          setflags |= PCRE2_BSR_SET;
10530
100
          break;
10531
10532
74
          case PSO_LIMM:
10533
100
          case PSO_LIMD:
10534
100
          case PSO_LIMH:
10535
100
          c = 0;
10536
100
          pp = skipatstart;
10537
603
          while (pp < patlen && IS_DIGIT(ptr[pp]))
10538
505
            {
10539
505
            if (c > UINT32_MAX / 10 - 1) break;   /* Integer overflow */
10540
503
            c = c*10 + (ptr[pp++] - CHAR_0);
10541
503
            }
10542
100
          if (pp >= patlen || pp == skipatstart || ptr[pp] != CHAR_RIGHT_PARENTHESIS)
10543
52
            {
10544
52
            errorcode = ERR60;
10545
52
            ptr += pp;
10546
52
            utf = FALSE;  /* Used by HAD_EARLY_ERROR */
10547
52
            goto HAD_EARLY_ERROR;
10548
52
            }
10549
48
          if (p->type == PSO_LIMH) limit_heap = c;
10550
48
            else if (p->type == PSO_LIMM) limit_match = c;
10551
1
            else limit_depth = c;
10552
48
          skipatstart = ++pp;
10553
48
          break;
10554
10555
68
          case PSO_OPTMZ:
10556
68
          optim_flags &= ~(p->value);
10557
10558
          /* For backward compatibility the three original VERBs to disable
10559
          optimizations need to also update the corresponding bit in the
10560
          external options. */
10561
10562
68
          switch(p->value)
10563
68
            {
10564
34
            case PCRE2_OPTIM_AUTO_POSSESS:
10565
34
            cb.external_options |= PCRE2_NO_AUTO_POSSESS;
10566
34
            break;
10567
10568
0
            case PCRE2_OPTIM_DOTSTAR_ANCHOR:
10569
0
            cb.external_options |= PCRE2_NO_DOTSTAR_ANCHOR;
10570
0
            break;
10571
10572
34
            case PCRE2_OPTIM_START_OPTIMIZE:
10573
34
            cb.external_options |= PCRE2_NO_START_OPTIMIZE;
10574
34
            break;
10575
68
            }
10576
10577
68
          break;
10578
10579
          /* LCOV_EXCL_START */
10580
68
          default:
10581
          /* All values in the enum need an explicit entry for this switch
10582
          but until a better way to prevent coding mistakes is invented keep
10583
          a catch all that triggers a debug build assert as a failsafe */
10584
0
          PCRE2_DEBUG_UNREACHABLE();
10585
          /* LCOV_EXCL_STOP */
10586
1.46k
          }
10587
1.40k
        break;   /* Out of the table scan loop */
10588
1.46k
        }
10589
40.6k
      }
10590
2.27k
    if (i >= sizeof(pso_list)/sizeof(pso)) break;   /* Out of pso loop */
10591
2.27k
    }
10592
11.1k
    PCRE2_ASSERT(skipatstart <= patlen);
10593
11.1k
  }
10594
10595
/* End of pattern-start options; advance to start of real regex. */
10596
10597
11.1k
ptr += skipatstart;
10598
10599
/* Can't support UTF or UCP if PCRE2 was built without Unicode support. */
10600
10601
#ifndef SUPPORT_UNICODE
10602
if ((cb.external_options & (PCRE2_UTF|PCRE2_UCP)) != 0)
10603
  {
10604
  errorcode = ERR32;
10605
  goto HAD_EARLY_ERROR;
10606
  }
10607
#endif
10608
10609
/* Check UTF. We have the original options in 'options', with that value as
10610
modified by (*UTF) etc in cb->external_options. The extra option
10611
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is not permitted in UTF-16 mode because the
10612
surrogate code points cannot be represented in UTF-16. */
10613
10614
11.1k
utf = (cb.external_options & PCRE2_UTF) != 0;
10615
11.1k
if (utf)
10616
14
  {
10617
14
  if ((options & PCRE2_NEVER_UTF) != 0)
10618
0
    {
10619
0
    errorcode = ERR74;
10620
0
    goto HAD_EARLY_ERROR;
10621
0
    }
10622
14
  if ((options & PCRE2_NO_UTF_CHECK) == 0 &&
10623
14
       (errorcode = PRIV(valid_utf)(pattern, patlen, erroroffset)) != 0)
10624
0
    goto HAD_ERROR;  /* Offset was set by valid_utf() */
10625
10626
#if PCRE2_CODE_UNIT_WIDTH == 16
10627
  if ((ccontext->extra_options & PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES) != 0)
10628
    {
10629
    errorcode = ERR91;
10630
    goto HAD_EARLY_ERROR;
10631
    }
10632
#endif
10633
14
  }
10634
10635
/* Check UCP lockout. */
10636
10637
11.1k
ucp = (cb.external_options & PCRE2_UCP) != 0;
10638
11.1k
if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)
10639
0
  {
10640
0
  errorcode = ERR75;
10641
0
  goto HAD_EARLY_ERROR;
10642
0
  }
10643
10644
/* PCRE2_EXTRA_TURKISH_CASING checks */
10645
10646
11.1k
if ((xoptions & PCRE2_EXTRA_TURKISH_CASING) != 0)
10647
0
  {
10648
0
  if (!utf && !ucp)
10649
0
    {
10650
0
    errorcode = ERR104;
10651
0
    goto HAD_EARLY_ERROR;
10652
0
    }
10653
10654
0
#if PCRE2_CODE_UNIT_WIDTH == 8
10655
0
  if (!utf)
10656
0
    {
10657
0
    errorcode = ERR105;
10658
0
    goto HAD_EARLY_ERROR;
10659
0
    }
10660
0
#endif
10661
10662
0
  if ((xoptions & PCRE2_EXTRA_CASELESS_RESTRICT) != 0)
10663
0
    {
10664
0
    errorcode = ERR106;
10665
0
    goto HAD_EARLY_ERROR;
10666
0
    }
10667
0
  }
10668
10669
/* Process the BSR setting. */
10670
10671
11.1k
if (bsr == 0) bsr = ccontext->bsr_convention;
10672
10673
/* Process the newline setting. */
10674
10675
11.1k
if (newline == 0) newline = ccontext->newline_convention;
10676
11.1k
cb.nltype = NLTYPE_FIXED;
10677
11.1k
switch(newline)
10678
11.1k
  {
10679
12
  case PCRE2_NEWLINE_CR:
10680
12
  cb.nllen = 1;
10681
12
  cb.nl[0] = CHAR_CR;
10682
12
  break;
10683
10684
10.9k
  case PCRE2_NEWLINE_LF:
10685
10.9k
  cb.nllen = 1;
10686
10.9k
  cb.nl[0] = CHAR_NL;
10687
10.9k
  break;
10688
10689
0
  case PCRE2_NEWLINE_NUL:
10690
0
  cb.nllen = 1;
10691
0
  cb.nl[0] = CHAR_NUL;
10692
0
  break;
10693
10694
51
  case PCRE2_NEWLINE_CRLF:
10695
51
  cb.nllen = 2;
10696
51
  cb.nl[0] = CHAR_CR;
10697
51
  cb.nl[1] = CHAR_NL;
10698
51
  break;
10699
10700
75
  case PCRE2_NEWLINE_ANY:
10701
75
  cb.nltype = NLTYPE_ANY;
10702
75
  break;
10703
10704
42
  case PCRE2_NEWLINE_ANYCRLF:
10705
42
  cb.nltype = NLTYPE_ANYCRLF;
10706
42
  break;
10707
10708
  /* LCOV_EXCL_START */
10709
0
  default:
10710
0
  PCRE2_DEBUG_UNREACHABLE();
10711
0
  errorcode = ERR56;
10712
0
  goto HAD_EARLY_ERROR;
10713
  /* LCOV_EXCL_STOP */
10714
11.1k
  }
10715
10716
/* Pre-scan the pattern to do two things: (1) Discover the named groups and
10717
their numerical equivalents, so that this information is always available for
10718
the remaining processing. (2) At the same time, parse the pattern and put a
10719
processed version into the parsed_pattern vector. This has escapes interpreted
10720
and comments removed (amongst other things). */
10721
10722
/* Ensure that the parsed pattern buffer is big enough. For many smaller
10723
patterns the vector on the stack (which was set up above) can be used. */
10724
10725
11.1k
parsed_size_needed = max_parsed_pattern(ptr, cb.end_pattern, utf, options);
10726
10727
/* Allow for 2x uint32_t at the start and 2 at the end, for
10728
PCRE2_EXTRA_MATCH_WORD or PCRE2_EXTRA_MATCH_LINE (which are exclusive). */
10729
10730
11.1k
if ((ccontext->extra_options &
10731
11.1k
     (PCRE2_EXTRA_MATCH_WORD|PCRE2_EXTRA_MATCH_LINE)) != 0)
10732
0
  parsed_size_needed += 4;
10733
10734
/* When PCRE2_AUTO_CALLOUT is set we allow for one callout at the end. */
10735
10736
11.1k
if ((options & PCRE2_AUTO_CALLOUT) != 0)
10737
0
  parsed_size_needed += 4;
10738
10739
11.1k
parsed_size_needed += 1;  /* For the final META_END */
10740
10741
11.1k
if (parsed_size_needed > PARSED_PATTERN_DEFAULT_SIZE)
10742
231
  {
10743
231
  uint32_t *heap_parsed_pattern = ccontext->memctl.malloc(
10744
231
    parsed_size_needed * sizeof(uint32_t), ccontext->memctl.memory_data);
10745
231
  if (heap_parsed_pattern == NULL)
10746
0
    {
10747
0
    *errorptr = ERR21;
10748
0
    goto EXIT;
10749
0
    }
10750
231
  cb.parsed_pattern = heap_parsed_pattern;
10751
231
  }
10752
11.1k
cb.parsed_pattern_end = cb.parsed_pattern + parsed_size_needed;
10753
10754
/* Do the parsing scan. */
10755
10756
11.1k
errorcode = parse_regex(ptr, cb.external_options, xoptions, &has_lookbehind, &cb);
10757
11.1k
if (errorcode != 0) goto HAD_CB_ERROR;
10758
10759
/* If there are any lookbehinds, scan the parsed pattern to figure out their
10760
lengths. Workspace is needed to remember whether numbered groups are or are not
10761
of limited length, and if limited, what the minimum and maximum lengths are.
10762
This caching saves re-computing the length of any group that is referenced more
10763
than once, which is particularly relevant when recursion is involved.
10764
Unnumbered groups do not have this exposure because they cannot be referenced.
10765
If there are sufficiently few groups, the default index vector on the stack, as
10766
set up above, can be used. Otherwise we have to get/free some heap memory. The
10767
vector must be initialized to zero. */
10768
10769
7.74k
if (has_lookbehind)
10770
1.04k
  {
10771
1.04k
  int loopcount = 0;
10772
1.04k
  if (cb.bracount >= GROUPINFO_DEFAULT_SIZE/2)
10773
6
    {
10774
6
    cb.groupinfo = ccontext->memctl.malloc(
10775
6
      (2 * (cb.bracount + 1))*sizeof(uint32_t), ccontext->memctl.memory_data);
10776
6
    if (cb.groupinfo == NULL)
10777
0
      {
10778
0
      errorcode = ERR21;
10779
0
      cb.erroroffset = 0;
10780
0
      goto HAD_CB_ERROR;
10781
0
      }
10782
6
    }
10783
1.04k
  memset(cb.groupinfo, 0, (2 * cb.bracount + 1) * sizeof(uint32_t));
10784
1.04k
  errorcode = check_lookbehinds(cb.parsed_pattern, NULL, NULL, &cb, &loopcount);
10785
1.04k
  if (errorcode != 0) goto HAD_CB_ERROR;
10786
1.04k
  }
10787
10788
/* For debugging, there is a function that shows the parsed pattern vector. */
10789
10790
#ifdef DEBUG_SHOW_PARSED
10791
fprintf(stderr, "+++ Pre-scan complete:\n");
10792
show_parsed(&cb);
10793
#endif
10794
10795
/* For debugging capturing information this code can be enabled. */
10796
10797
#ifdef DEBUG_SHOW_CAPTURES
10798
  {
10799
  named_group *ng = cb.named_groups;
10800
  fprintf(stderr, "+++Captures: %d\n", cb.bracount);
10801
  for (i = 0; i < cb.names_found; i++, ng++)
10802
    {
10803
    fprintf(stderr, "+++%3d %.*s\n", ng->number, ng->length, ng->name);
10804
    }
10805
  }
10806
#endif
10807
10808
/* Pretend to compile the pattern while actually just accumulating the amount
10809
of memory required in the 'length' variable. This behaviour is triggered by
10810
passing a non-NULL final argument to compile_regex(). We pass a block of
10811
workspace (cworkspace) for it to compile parts of the pattern into; the
10812
compiled code is discarded when it is no longer needed, so hopefully this
10813
workspace will never overflow, though there is a test for its doing so.
10814
10815
On error, errorcode will be set non-zero, so we don't need to look at the
10816
result of the function. The initial options have been put into the cb block,
10817
but we still have to pass a separate options variable (the first argument)
10818
because the options may change as the pattern is processed. */
10819
10820
7.40k
cb.erroroffset = patlen;   /* For any subsequent errors that do not set it */
10821
7.40k
pptr = cb.parsed_pattern;
10822
7.40k
code = cworkspace;
10823
7.40k
*code = OP_BRA;
10824
10825
7.40k
(void)compile_regex(cb.external_options, xoptions, &code, &pptr,
10826
7.40k
   &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL, NULL,
10827
7.40k
   &cb, &length);
10828
10829
7.40k
if (errorcode != 0) goto HAD_CB_ERROR;  /* Offset is in cb.erroroffset */
10830
10831
/* This should be caught in compile_regex(), but just in case... */
10832
10833
6.24k
#if defined SUPPORT_WIDE_CHARS
10834
6.24k
PCRE2_ASSERT((cb.char_lists_size & 0x3) == 0);
10835
6.24k
if (length > MAX_PATTERN_SIZE ||
10836
6.24k
    MAX_PATTERN_SIZE - length < (cb.char_lists_size / sizeof(PCRE2_UCHAR)))
10837
#else
10838
if (length > MAX_PATTERN_SIZE)
10839
#endif
10840
0
  {
10841
0
  errorcode = ERR20;
10842
0
  cb.erroroffset = 0;
10843
0
  goto HAD_CB_ERROR;
10844
0
  }
10845
10846
/* Compute the size of, then, if not too large, get and initialize the data
10847
block for storing the compiled pattern and names table. Integer overflow should
10848
no longer be possible because nowadays we limit the maximum value of
10849
cb.names_found and cb.name_entry_size. */
10850
10851
6.24k
re_blocksize =
10852
6.24k
  CU2BYTES((PCRE2_SIZE)cb.names_found * (PCRE2_SIZE)cb.name_entry_size);
10853
10854
6.24k
#if defined SUPPORT_WIDE_CHARS
10855
6.24k
if (cb.char_lists_size != 0)
10856
0
  {
10857
0
#if PCRE2_CODE_UNIT_WIDTH != 32
10858
  /* Align to 32 bit first. This ensures the
10859
  allocated area will also be 32 bit aligned. */
10860
0
  re_blocksize = (PCRE2_SIZE)CLIST_ALIGN_TO(re_blocksize, sizeof(uint32_t));
10861
0
#endif
10862
0
  re_blocksize += cb.char_lists_size;
10863
0
  }
10864
6.24k
#endif
10865
10866
6.24k
re_blocksize += CU2BYTES(length);
10867
10868
6.24k
if (re_blocksize > ccontext->max_pattern_compiled_length)
10869
0
  {
10870
0
  errorcode = ERR101;
10871
0
  cb.erroroffset = 0;
10872
0
  goto HAD_CB_ERROR;
10873
0
  }
10874
10875
6.24k
re_blocksize += sizeof(pcre2_real_code);
10876
6.24k
re = (pcre2_real_code *)
10877
6.24k
  ccontext->memctl.malloc(re_blocksize, ccontext->memctl.memory_data);
10878
6.24k
if (re == NULL)
10879
0
  {
10880
0
  errorcode = ERR21;
10881
0
  cb.erroroffset = 0;
10882
0
  goto HAD_CB_ERROR;
10883
0
  }
10884
10885
/* The compiler may put padding at the end of the pcre2_real_code structure in
10886
order to round it up to a multiple of 4 or 8 bytes. This means that when a
10887
compiled pattern is copied (for example, when serialized) undefined bytes are
10888
read, and this annoys debuggers such as valgrind. To avoid this, we explicitly
10889
write to the last 8 bytes of the structure before setting the fields. */
10890
10891
6.24k
memset((char *)re + sizeof(pcre2_real_code) - 8, 0, 8);
10892
6.24k
re->memctl = ccontext->memctl;
10893
6.24k
re->tables = tables;
10894
6.24k
re->executable_jit = NULL;
10895
6.24k
memset(re->start_bitmap, 0, 32 * sizeof(uint8_t));
10896
6.24k
re->blocksize = re_blocksize;
10897
6.24k
re->code_start = re_blocksize - CU2BYTES(length);
10898
6.24k
re->magic_number = MAGIC_NUMBER;
10899
6.24k
re->compile_options = options;
10900
6.24k
re->overall_options = cb.external_options;
10901
6.24k
re->extra_options = xoptions;
10902
6.24k
re->flags = PCRE2_CODE_UNIT_WIDTH/8 | cb.external_flags | setflags;
10903
6.24k
re->limit_heap = limit_heap;
10904
6.24k
re->limit_match = limit_match;
10905
6.24k
re->limit_depth = limit_depth;
10906
6.24k
re->first_codeunit = 0;
10907
6.24k
re->last_codeunit = 0;
10908
6.24k
re->bsr_convention = bsr;
10909
6.24k
re->newline_convention = newline;
10910
6.24k
re->max_lookbehind = 0;
10911
6.24k
re->minlength = 0;
10912
6.24k
re->top_bracket = 0;
10913
6.24k
re->top_backref = 0;
10914
6.24k
re->name_entry_size = cb.name_entry_size;
10915
6.24k
re->name_count = cb.names_found;
10916
6.24k
re->optimization_flags = optim_flags;
10917
10918
/* The basic block is immediately followed by the name table, and the compiled
10919
code follows after that. */
10920
10921
6.24k
codestart = (PCRE2_UCHAR *)((uint8_t *)re + re->code_start);
10922
10923
/* Update the compile data block for the actual compile. The starting points of
10924
the name/number translation table and of the code are passed around in the
10925
compile data block. The start/end pattern and initial options are already set
10926
from the pre-compile phase, as is the name_entry_size field. */
10927
10928
6.24k
cb.parens_depth = 0;
10929
6.24k
cb.assert_depth = 0;
10930
6.24k
cb.lastcapture = 0;
10931
6.24k
cb.name_table = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code));
10932
6.24k
cb.start_code = codestart;
10933
6.24k
cb.req_varyopt = 0;
10934
6.24k
cb.had_accept = FALSE;
10935
6.24k
cb.had_pruneorskip = FALSE;
10936
6.24k
#ifdef SUPPORT_WIDE_CHARS
10937
6.24k
cb.char_lists_size = 0;
10938
6.24k
#endif
10939
10940
10941
/* If any named groups were found, create the name/number table from the list
10942
created in the pre-pass. */
10943
10944
6.24k
if (cb.names_found > 0)
10945
354
  {
10946
354
  named_group *ng = cb.named_groups;
10947
354
  uint32_t tablecount = 0;
10948
10949
  /* Length 0 represents duplicates, and they have already been handled. */
10950
1.53k
  for (i = 0; i < cb.names_found; i++, ng++)
10951
1.17k
    if (ng->length > 0)
10952
605
      tablecount = PRIV(compile_add_name_to_table)(&cb, ng, tablecount);
10953
10954
354
  PCRE2_ASSERT(tablecount == cb.names_found);
10955
354
  }
10956
10957
/* Set up a starting, non-extracting bracket, then compile the expression. On
10958
error, errorcode will be set non-zero, so we don't need to look at the result
10959
of the function here. */
10960
10961
6.24k
pptr = cb.parsed_pattern;
10962
6.24k
code = (PCRE2_UCHAR *)codestart;
10963
6.24k
*code = OP_BRA;
10964
6.24k
regexrc = compile_regex(re->overall_options, re->extra_options, &code,
10965
6.24k
  &pptr, &errorcode, 0, &firstcu, &firstcuflags, &reqcu, &reqcuflags, NULL,
10966
6.24k
  NULL, &cb, NULL);
10967
6.24k
if (regexrc < 0) re->flags |= PCRE2_MATCH_EMPTY;
10968
6.24k
re->top_bracket = cb.bracount;
10969
6.24k
re->top_backref = cb.top_backref;
10970
6.24k
re->max_lookbehind = cb.max_lookbehind;
10971
10972
6.24k
if (cb.had_accept)
10973
46
  {
10974
46
  reqcu = 0;                     /* Must disable after (*ACCEPT) */
10975
46
  reqcuflags = REQ_NONE;
10976
46
  re->flags |= PCRE2_HASACCEPT;  /* Disables minimum length */
10977
46
  }
10978
10979
/* Fill in the final opcode and check for disastrous overflow. If no overflow,
10980
but the estimated length exceeds the really used length, adjust the value of
10981
re->blocksize, and if valgrind support is configured, mark the extra allocated
10982
memory as unaddressable, so that any out-of-bound reads can be detected. */
10983
10984
6.24k
*code++ = OP_END;
10985
6.24k
usedlength = code - codestart;
10986
/* LCOV_EXCL_START */
10987
6.24k
if (usedlength > length)
10988
0
  {
10989
0
  PCRE2_DEBUG_UNREACHABLE();
10990
0
  errorcode = ERR23;  /* Overflow of code block - internal error */
10991
0
  cb.erroroffset = 0;
10992
0
  goto HAD_CB_ERROR;
10993
0
  }
10994
/* LCOV_EXCL_STOP */
10995
10996
6.24k
re->blocksize -= CU2BYTES(length - usedlength);
10997
#ifdef SUPPORT_VALGRIND
10998
VALGRIND_MAKE_MEM_NOACCESS(code, CU2BYTES(length - usedlength));
10999
#endif
11000
11001
/* Scan the pattern for recursion/subroutine calls and convert the group
11002
numbers into offsets. Maintain a small cache so that repeated groups containing
11003
recursions are efficiently handled. */
11004
11005
6.24k
#define RSCAN_CACHE_SIZE 8
11006
11007
6.24k
if (errorcode == 0 && cb.had_recurse)
11008
465
  {
11009
465
  PCRE2_UCHAR *rcode;
11010
465
  PCRE2_SPTR rgroup;
11011
465
  unsigned int ccount = 0;
11012
465
  int start = RSCAN_CACHE_SIZE;
11013
465
  recurse_cache rc[RSCAN_CACHE_SIZE];
11014
11015
465
  for (rcode = find_recurse(codestart, utf);
11016
99.6k
       rcode != NULL;
11017
99.1k
       rcode = find_recurse(rcode + 1 + LINK_SIZE, utf))
11018
99.1k
    {
11019
99.1k
    int p, groupnumber;
11020
11021
99.1k
    groupnumber = (int)GET(rcode, 1);
11022
99.1k
    if (groupnumber == 0) rgroup = codestart; else
11023
94.6k
      {
11024
94.6k
      PCRE2_SPTR search_from = codestart;
11025
94.6k
      rgroup = NULL;
11026
97.7k
      for (i = 0, p = start; i < ccount; i++, p = (p + 1) & 7)
11027
97.2k
        {
11028
97.2k
        if (groupnumber == rc[p].groupnumber)
11029
94.1k
          {
11030
94.1k
          rgroup = rc[p].group;
11031
94.1k
          break;
11032
94.1k
          }
11033
11034
        /* Group n+1 must always start to the right of group n, so we can save
11035
        search time below when the new group number is greater than any of the
11036
        previously found groups. */
11037
11038
3.15k
        if (groupnumber > rc[p].groupnumber) search_from = rc[p].group;
11039
3.15k
        }
11040
11041
94.6k
      if (rgroup == NULL)
11042
498
        {
11043
498
        rgroup = PRIV(find_bracket)(search_from, utf, groupnumber);
11044
        /* LCOV_EXCL_START */
11045
498
        if (rgroup == NULL)
11046
0
          {
11047
0
          PCRE2_DEBUG_UNREACHABLE();
11048
0
          errorcode = ERR53;
11049
0
          break;
11050
0
          }
11051
        /* LCOV_EXCL_STOP */
11052
11053
498
        if (--start < 0) start = RSCAN_CACHE_SIZE - 1;
11054
498
        rc[start].groupnumber = groupnumber;
11055
498
        rc[start].group = rgroup;
11056
498
        if (ccount < RSCAN_CACHE_SIZE) ccount++;
11057
498
        }
11058
94.6k
      }
11059
11060
99.1k
    PUT(rcode, 1, (uint32_t)(rgroup - codestart));
11061
99.1k
    }
11062
465
  }
11063
11064
/* In rare debugging situations we sometimes need to look at the compiled code
11065
at this stage. */
11066
11067
#ifdef DEBUG_CALL_PRINTINT
11068
pcre2_printint(re, stderr, TRUE);
11069
fprintf(stderr, "Length=%lu Used=%lu\n", length, usedlength);
11070
#endif
11071
11072
/* Unless disabled, check whether any single character iterators can be
11073
auto-possessified. The function overwrites the appropriate opcode values, so
11074
the type of the pointer must be cast. NOTE: the intermediate variable "temp" is
11075
used in this code because at least one compiler gives a warning about loss of
11076
"const" attribute if the cast (PCRE2_UCHAR *)codestart is used directly in the
11077
function call. */
11078
11079
6.24k
if (errorcode == 0 && (optim_flags & PCRE2_OPTIM_AUTO_POSSESS) != 0)
11080
6.22k
  {
11081
6.22k
  PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart;
11082
6.22k
  int possessify_rc = PRIV(auto_possessify)(temp, &cb);
11083
  /* LCOV_EXCL_START */
11084
6.22k
  if (possessify_rc != 0)
11085
0
    {
11086
0
    PCRE2_DEBUG_UNREACHABLE();
11087
0
    errorcode = ERR80;
11088
0
    cb.erroroffset = 0;
11089
0
    }
11090
  /* LCOV_EXCL_STOP */
11091
6.22k
  }
11092
11093
/* Failed to compile, or error while post-processing. */
11094
11095
6.24k
if (errorcode != 0) goto HAD_CB_ERROR;
11096
11097
/* Successful compile. If the anchored option was not passed, set it if
11098
we can determine that the pattern is anchored by virtue of ^ characters or \A
11099
or anything else, such as starting with non-atomic .* when DOTALL is set and
11100
there are no occurrences of *PRUNE or *SKIP (though there is an option to
11101
disable this case). */
11102
11103
6.23k
if ((re->overall_options & PCRE2_ANCHORED) == 0)
11104
6.23k
  {
11105
6.23k
  BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
11106
6.23k
  if (is_anchored(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
11107
92
    re->overall_options |= PCRE2_ANCHORED;
11108
6.23k
  }
11109
11110
/* Set up the first code unit or startline flag, the required code unit, and
11111
then study the pattern. This code need not be obeyed if PCRE2_OPTIM_START_OPTIMIZE
11112
is disabled, as the data it would create will not be used. Note that a first code
11113
unit (but not the startline flag) is useful for anchored patterns because it
11114
can still give a quick "no match" and also avoid searching for a last code
11115
unit. */
11116
11117
6.23k
if ((optim_flags & PCRE2_OPTIM_START_OPTIMIZE) != 0)
11118
6.22k
  {
11119
6.22k
  int minminlength = 0;  /* For minimal minlength from first/required CU */
11120
6.22k
  int study_rc;
11121
11122
  /* If we do not have a first code unit, see if there is one that is asserted
11123
  (these are not saved during the compile because they can cause conflicts with
11124
  actual literals that follow). */
11125
11126
6.22k
  if (firstcuflags >= REQ_NONE) {
11127
4.97k
    uint32_t assertedcuflags = 0;
11128
4.97k
    uint32_t assertedcu = find_firstassertedcu(codestart, &assertedcuflags, 0);
11129
    /* It would be wrong to use the asserted first code unit as `firstcu` for
11130
     * regexes which are able to match a 1-character string (e.g. /(?=a)b?a/)
11131
     * For that example, if we set both firstcu and reqcu to 'a', it would mean
11132
     * the subject string needs to be at least 2 characters long, which is wrong.
11133
     * With more analysis, we would be able to set firstcu in more cases. */
11134
4.97k
    if (assertedcuflags < REQ_NONE && assertedcu != reqcu) {
11135
51
      firstcu = assertedcu;
11136
51
      firstcuflags = assertedcuflags;
11137
51
    }
11138
4.97k
  }
11139
11140
  /* Save the data for a first code unit. The existence of one means the
11141
  minimum length must be at least 1. */
11142
11143
6.22k
  if (firstcuflags < REQ_NONE)
11144
1.29k
    {
11145
1.29k
    re->first_codeunit = firstcu;
11146
1.29k
    re->flags |= PCRE2_FIRSTSET;
11147
1.29k
    minminlength++;
11148
11149
    /* Handle caseless first code units. */
11150
11151
1.29k
    if ((firstcuflags & REQ_CASELESS) != 0)
11152
42
      {
11153
42
      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))
11154
40
        {
11155
40
        if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;
11156
40
        }
11157
11158
      /* The first code unit is >= 128 in UTF or UCP mode, or >= 255 otherwise.
11159
      In 8-bit UTF mode, code units in the range 128-255 are introductory code
11160
      units and cannot have another case, but if UCP is set they may do. */
11161
11162
2
#ifdef SUPPORT_UNICODE
11163
2
#if PCRE2_CODE_UNIT_WIDTH == 8
11164
2
      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu)
11165
0
        re->flags |= PCRE2_FIRSTCASELESS;
11166
#else
11167
      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&
11168
               UCD_OTHERCASE(firstcu) != firstcu)
11169
        re->flags |= PCRE2_FIRSTCASELESS;
11170
#endif
11171
42
#endif  /* SUPPORT_UNICODE */
11172
42
      }
11173
1.29k
    }
11174
11175
  /* When there is no first code unit, for non-anchored patterns, see if we can
11176
  set the PCRE2_STARTLINE flag. This is helpful for multiline matches when all
11177
  branches start with ^ and also when all branches start with non-atomic .* for
11178
  non-DOTALL matches when *PRUNE and *SKIP are not present. (There is an option
11179
  that disables this case.) */
11180
11181
4.92k
  else if ((re->overall_options & PCRE2_ANCHORED) == 0)
11182
4.84k
    {
11183
4.84k
    BOOL dotstar_anchor = ((optim_flags & PCRE2_OPTIM_DOTSTAR_ANCHOR) != 0);
11184
4.84k
    if (is_startline(codestart, 0, &cb, 0, FALSE, dotstar_anchor))
11185
81
      re->flags |= PCRE2_STARTLINE;
11186
4.84k
    }
11187
11188
  /* Handle the "required code unit", if one is set. In the UTF case we can
11189
  increment the minimum minimum length only if we are sure this really is a
11190
  different character and not a non-starting code unit of the first character,
11191
  because the minimum length count is in characters, not code units. */
11192
11193
6.22k
  if (reqcuflags < REQ_NONE)
11194
1.59k
    {
11195
#if PCRE2_CODE_UNIT_WIDTH == 16
11196
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
11197
        firstcuflags >= REQ_NONE ||                 /* First not set */
11198
        (firstcu & 0xf800) != 0xd800 ||             /* First not surrogate */
11199
        (reqcu & 0xfc00) != 0xdc00)                 /* Req not low surrogate */
11200
#elif PCRE2_CODE_UNIT_WIDTH == 8
11201
1.59k
    if ((re->overall_options & PCRE2_UTF) == 0 ||   /* Not UTF */
11202
0
        firstcuflags >= REQ_NONE ||                 /* First not set */
11203
0
        (firstcu & 0x80) == 0 ||                    /* First is ASCII */
11204
0
        (reqcu & 0x80) == 0)                        /* Req is ASCII */
11205
1.59k
#endif
11206
1.59k
      {
11207
1.59k
      minminlength++;
11208
1.59k
      }
11209
11210
    /* In the case of an anchored pattern, set up the value only if it follows
11211
    a variable length item in the pattern. */
11212
11213
1.59k
    if ((re->overall_options & PCRE2_ANCHORED) == 0 ||
11214
10
        (reqcuflags & REQ_VARY) != 0)
11215
1.58k
      {
11216
1.58k
      re->last_codeunit = reqcu;
11217
1.58k
      re->flags |= PCRE2_LASTSET;
11218
11219
      /* Handle caseless required code units as for first code units (above). */
11220
11221
1.58k
      if ((reqcuflags & REQ_CASELESS) != 0)
11222
60
        {
11223
60
        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))
11224
55
          {
11225
55
          if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;
11226
55
          }
11227
5
#ifdef SUPPORT_UNICODE
11228
5
#if PCRE2_CODE_UNIT_WIDTH == 8
11229
5
      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu)
11230
0
        re->flags |= PCRE2_LASTCASELESS;
11231
#else
11232
      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT &&
11233
               UCD_OTHERCASE(reqcu) != reqcu)
11234
        re->flags |= PCRE2_LASTCASELESS;
11235
#endif
11236
60
#endif  /* SUPPORT_UNICODE */
11237
60
        }
11238
1.58k
      }
11239
1.59k
    }
11240
11241
  /* Study the compiled pattern to set up information such as a bitmap of
11242
  starting code units and a minimum matching length. */
11243
11244
6.22k
  study_rc = PRIV(study)(re);
11245
  /* LCOV_EXCL_START */
11246
6.22k
  if (study_rc != 0)
11247
0
    {
11248
0
    PCRE2_DEBUG_UNREACHABLE();
11249
0
    errorcode = ERR31;
11250
0
    cb.erroroffset = 0;
11251
0
    goto HAD_CB_ERROR;
11252
0
    }
11253
  /* LCOV_EXCL_STOP */
11254
11255
  /* If study() set a bitmap of starting code units, it implies a minimum
11256
  length of at least one. */
11257
11258
6.22k
  if ((re->flags & PCRE2_FIRSTMAPSET) != 0 && minminlength == 0)
11259
601
    minminlength = 1;
11260
11261
  /* If the minimum length set (or not set) by study() is less than the minimum
11262
  implied by required code units, override it. */
11263
11264
6.22k
  if (re->minlength < minminlength) re->minlength = minminlength;
11265
6.22k
  }   /* End of start-of-match optimizations. */
11266
11267
/* Control ends up here in all cases. When running under valgrind, make a
11268
pattern's terminating zero defined again. If memory was obtained for the parsed
11269
version of the pattern, free it before returning. Also free the list of named
11270
groups if a larger one had to be obtained, and likewise the group information
11271
vector. */
11272
11273
6.23k
#ifdef SUPPORT_UNICODE
11274
/* All items must be freed. */
11275
6.23k
PCRE2_ASSERT(cb.first_data == NULL);
11276
6.23k
#endif
11277
11278
11.1k
EXIT:
11279
#ifdef SUPPORT_VALGRIND
11280
if (zero_terminated) VALGRIND_MAKE_MEM_DEFINED(pattern + patlen, CU2BYTES(1));
11281
#endif
11282
11.1k
if (cb.parsed_pattern != stack_parsed_pattern)
11283
231
  ccontext->memctl.free(cb.parsed_pattern, ccontext->memctl.memory_data);
11284
11.1k
if (cb.named_group_list_size > NAMED_GROUP_LIST_SIZE)
11285
17
  ccontext->memctl.free((void *)cb.named_groups, ccontext->memctl.memory_data);
11286
11.1k
if (cb.groupinfo != stack_groupinfo)
11287
6
  ccontext->memctl.free((void *)cb.groupinfo, ccontext->memctl.memory_data);
11288
11289
11.1k
return re;    /* Will be NULL after an error */
11290
11291
/* Errors discovered in parse_regex() set the offset value in the compile
11292
block. Errors discovered before it is called must compute it from the ptr
11293
value. After parse_regex() is called, the offset in the compile block is set to
11294
the end of the pattern, but certain errors in compile_regex() may reset it if
11295
an offset is available in the parsed pattern. */
11296
11297
4.89k
HAD_CB_ERROR:
11298
4.89k
ptr = pattern + cb.erroroffset;
11299
11300
4.95k
HAD_EARLY_ERROR:
11301
/* Ensure we don't return out-of-range erroroffset. */
11302
4.95k
PCRE2_ASSERT(ptr >= pattern);
11303
4.95k
PCRE2_ASSERT(ptr <= (pattern + patlen));
11304
/* Ensure that the erroroffset never slices a UTF-encoded character in half.
11305
If the input is invalid, then we return an offset just before the first invalid
11306
character, so the text to the left of the offset must always be valid. */
11307
#if defined PCRE2_DEBUG && defined SUPPORT_UNICODE
11308
if (ptr > pattern && utf)
11309
  {
11310
  PCRE2_SPTR prev = ptr - 1;
11311
  PCRE2_SIZE dummyoffset;
11312
  BACKCHAR(prev);
11313
  PCRE2_ASSERT(prev >= pattern);
11314
  PCRE2_ASSERT(PRIV(valid_utf)(prev, ptr - prev, &dummyoffset) == 0);
11315
  }
11316
#endif
11317
4.95k
*erroroffset = ptr - pattern;
11318
11319
4.95k
HAD_ERROR:
11320
4.95k
*errorptr = errorcode;
11321
4.95k
pcre2_code_free(re);
11322
4.95k
re = NULL;
11323
11324
4.95k
if (cb.first_data != NULL)
11325
0
  {
11326
0
  compile_data* current_data = cb.first_data;
11327
0
  do
11328
0
    {
11329
0
    compile_data* next_data = current_data->next;
11330
0
    cb.cx->memctl.free(current_data, cb.cx->memctl.memory_data);
11331
0
    current_data = next_data;
11332
0
    }
11333
0
  while (current_data != NULL);
11334
0
  }
11335
11336
4.95k
goto EXIT;
11337
4.95k
}
11338
11339
/* These #undefs are here to enable unity builds with CMake. In a unity build
several source files are combined into a single translation unit, so any macro
still defined at the end of this file would remain visible to the files that
are compiled after it. Removing the short, generic names below keeps them from
leaking into, or clashing with, those later files. */
11340
11341
#undef NLBLOCK /* Block containing newline information */
11342
#undef PSSTART /* Field containing processed string start */
11343
#undef PSEND   /* Field containing processed string end */
11344
11345
/* End of pcre2_compile.c */